{
  "arc_easy": {
    "benchmark": "arc_easy",
    "metric": "accuracy",
    "display_name": "ARC-E",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
  },
  "arc_challenge": {
    "benchmark": "arc_challenge",
    "metric": "accuracy",
    "display_name": "ARC-C",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
  },
  "drop": {
    "benchmark": "drop",
    "metric": "mean",
    "display_name": "DROP",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/drop"
  },
  "winogrande": {
    "benchmark": "winogrande",
    "metric": "accuracy",
    "display_name": "WinoGrande",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/winogrande"
  },
  "gsm8k": {
    "benchmark": "gsm8k",
    "metric": "accuracy",
    "display_name": "GSM8K",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gsm8k"
  },
  "hellaswag": {
    "benchmark": "hellaswag",
    "metric": "accuracy",
    "display_name": "HellaSwag",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/hellaswag"
  },
  "humaneval": {
    "benchmark": "humaneval",
    "metric": "mean",
    "display_name": "HumanEval",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/humaneval"
  },
  "ifeval": {
    "benchmark": "ifeval",
    "metric": "final_acc",
    "display_name": "IFEval",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/ifeval"
  },
  "math": {
    "benchmark": "math",
    "metric": "accuracy",
    "display_name": "MATH",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mathematics"
  },
  "mmlu": {
    "benchmark": "mmlu",
    "metric": "accuracy",
    "display_name": "MMLU",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu"
  },
  "mmlu_pro": {
    "benchmark": "mmlu_pro",
    "metric": "accuracy",
    "display_name": "MMLU-Pro",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu_pro"
  },
  "gpqa_diamond": {
    "benchmark": "gpqa_diamond",
    "metric": "accuracy",
    "display_name": "GPQA-D",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa"
  },
  "mmmu_multiple_choice": {
    "benchmark": "mmmu_multiple_choice",
    "metric": "accuracy",
    "display_name": "MMMU-MC",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu"
  },
  "mmmu_open": {
    "benchmark": "mmmu_open",
    "metric": "accuracy",
    "display_name": "MMMU-OE",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu"
  },
  "gaia": {
    "benchmark": "gaia",
    "metric": "accuracy",
    "display_name": "GAIA",
    "type": "agentic",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
  },
  "gdm_intercode_ctf": {
    "benchmark": "gdm_intercode_ctf",
    "metric": "accuracy",
    "display_name": "InterCode-CTF",
    "type": "agentic",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf"
  },
  "gdm_in_house_ctf": {
    "benchmark": "gdm_in_house_ctf",
    "metric": "accuracy",
    "display_name": "In-House-CTF",
    "type": "agentic",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/in_house_ctf"
  },
  "agentharm": {
    "benchmark": "agentharm",
    "metric": "avg_score",
    "display_name": "AgentHarm",
    "type": "agentic",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm"
  },
  "agentharm_benign": {
    "benchmark": "agentharm_benign",
    "metric": "avg_score",
    "display_name": "AgentHarm-Benign",
    "type": "agentic",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm"
  },
  "swe_bench": {
    "benchmark": "swe_bench",
    "metric": "mean",
    "display_name": "SWE-Bench",
    "type": "agentic",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/swe_bench"
  }
}