eval-leaderboard / data /tasks.json
jwilles's picture
Update display names
159f31f
raw
history blame contribute delete
5.18 kB
{
"arc_easy": {
"benchmark": "arc_easy",
"metric": "accuracy",
"display_name": "ARC-E",
"type": "base",
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
},
"arc_challenge": {
"benchmark": "arc_challenge",
"metric": "accuracy",
"display_name": "ARC-C",
"type": "base",
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
},
"drop": {
"benchmark": "drop",
"metric": "mean",
"display_name": "DROP",
"type": "base",
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/drop"
},
"winogrande": {
"benchmark": "winogrande",
"metric": "accuracy",
"display_name": "WinoGrande",
"type": "base",
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/winogrande"
},
"gsm8k": {
"benchmark": "gsm8k",
"metric": "accuracy",
"display_name": "GSM8K",
"type": "base",
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gsm8k"
},
"hellaswag": {
"benchmark": "hellaswag",
"metric": "accuracy",
"display_name": "HellaSwag",
"type": "base",
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/hellaswag"
},
"humaneval": {
"benchmark": "humaneval",
"metric": "mean",
"display_name": "HumanEval",
"type": "base",
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/humaneval"
},
"ifeval": {
"benchmark": "ifeval",
"metric": "final_acc",
"display_name": "IFEval",
"type": "base",
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/ifeval"
},
"math": {
"benchmark": "math",
"metric": "accuracy",
"display_name": "MATH",
"type": "base",
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mathematics"
},
"mmlu": {
"benchmark": "mmlu",
"metric": "accuracy",
"display_name": "MMLU",
"type": "base",
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu"
},
"mmlu_pro": {
"benchmark": "mmlu_pro",
"metric": "accuracy",
"display_name": "MMLU-Pro",
"type": "base",
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu_pro"
},
"gpqa_diamond": {
"benchmark": "gpqa_diamond",
"metric": "accuracy",
"display_name": "GPQA-D",
"type": "base",
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa"
},
"mmmu_multiple_choice": {
"benchmark": "mmmu_multiple_choice",
"metric": "accuracy",
"display_name": "MMMU-MC",
"type": "base",
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu"
},
"mmmu_open": {
"benchmark": "mmmu_open",
"metric": "accuracy",
"display_name": "MMMU-OE",
"type": "base",
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu"
},
"gaia": {
"benchmark": "gaia",
"metric": "accuracy",
"display_name": "GAIA",
"type": "agentic",
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
},
"gdm_intercode_ctf": {
"benchmark": "gdm_intercode_ctf",
"metric": "accuracy",
"display_name": "InterCode-CTF",
"type": "agentic",
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf"
},
"gdm_in_house_ctf": {
"benchmark": "gdm_in_house_ctf",
"metric": "accuracy",
"display_name": "In-House-CTF",
"type": "agentic",
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/in_house_ctf"
},
"agentharm": {
"benchmark": "agentharm",
"metric": "avg_score",
"display_name": "AgentHarm",
"type": "agentic",
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm"
},
"agentharm_benign": {
"benchmark": "agentharm_benign",
"metric": "avg_score",
"display_name": "AgentHarm-Benign",
"type": "agentic",
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm"
},
"swe_bench": {
"benchmark": "swe_bench",
"metric": "mean",
"display_name": "SWE-Bench",
"type": "agentic",
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/swe_bench"
}
}