{
  "arc_easy": {
    "benchmark": "arc_easy",
    "metric": "accuracy",
    "display_name": "ARC-E",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
  },
  "arc_challenge": {
    "benchmark": "arc_challenge",
    "metric": "accuracy",
    "display_name": "ARC-C",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
  },
  "drop": {
    "benchmark": "drop",
    "metric": "mean",
    "display_name": "DROP",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/drop"
  },
  "winogrande": {
    "benchmark": "winogrande",
    "metric": "accuracy",
    "display_name": "WinoGrande",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/winogrande"
  },
  "gsm8k": {
    "benchmark": "gsm8k",
    "metric": "accuracy",
    "display_name": "GSM8K",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gsm8k"
  },
  "hellaswag": {
    "benchmark": "hellaswag",
    "metric": "accuracy",
    "display_name": "HellaSwag",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/hellaswag"
  },
  "humaneval": {
    "benchmark": "humaneval",
    "metric": "mean",
    "display_name": "HumanEval",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/humaneval"
  },
  "ifeval": {
    "benchmark": "ifeval",
    "metric": "final_acc",
    "display_name": "IFEval",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/ifeval"
  },
  "math": {
    "benchmark": "math",
    "metric": "accuracy",
    "display_name": "MATH",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mathematics"
  },
  "mmlu": {
    "benchmark": "mmlu",
    "metric": "accuracy",
    "display_name": "MMLU",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu"
  },
  "mmlu_pro": {
    "benchmark": "mmlu_pro",
    "metric": "accuracy",
    "display_name": "MMLU-Pro",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu_pro"
  },
  "gpqa_diamond": {
    "benchmark": "gpqa_diamond",
    "metric": "accuracy",
    "display_name": "GPQA-D",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa"
  },
  "mmmu_multiple_choice": {
    "benchmark": "mmmu_multiple_choice",
    "metric": "accuracy",
    "display_name": "MMMU-MC",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu"
  },
  "mmmu_open": {
    "benchmark": "mmmu_open",
    "metric": "accuracy",
    "display_name": "MMMU-OE",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu"
  },
  "gaia": {
    "benchmark": "gaia",
    "metric": "accuracy",
    "display_name": "GAIA",
    "type": "agentic",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
  },
  "gdm_intercode_ctf": {
    "benchmark": "gdm_intercode_ctf",
    "metric": "accuracy",
    "display_name": "InterCode-CTF",
    "type": "agentic",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf"
  },
  "gdm_in_house_ctf": {
    "benchmark": "gdm_in_house_ctf",
    "metric": "accuracy",
    "display_name": "In-House-CTF",
    "type": "agentic",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/in_house_ctf"
  },
  "agentharm": {
    "benchmark": "agentharm",
    "metric": "avg_score",
    "display_name": "AgentHarm",
    "type": "agentic",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm"
  },
  "agentharm_benign": {
    "benchmark": "agentharm_benign",
    "metric": "avg_score",
    "display_name": "AgentHarm-Benign",
    "type": "agentic",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm"
  },
  "swe_bench": {
    "benchmark": "swe_bench",
    "metric": "mean",
    "display_name": "SWE-Bench",
    "type": "agentic",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/swe_bench"
  }
}