GenerativeReasoningBenchmark / data /success_rate_0402.json
zhwang4ai's picture
init leaderboard
f49345e
{
"RelicEnv": {
"qwen2.5-3b-instruct": 0.18,
"qwen2.5-7b-instruct": 0.396,
"qwen2.5-14b-instruct": 0.8,
"qwen2.5-32b-instruct": 0.8560000000000001,
"qwen2.5-72b-instruct": 0.892,
"llama-3.1-8b-instruct": 0.21600000000000003,
"llama-3.1-70b-instruct": 0.6639999999999999,
"llama-3.2-3b-instruct": 0.164,
"llama-3.3-70b-instruct": 0.836,
"mistral-large-instruct-2411": 0.8560000000000001,
"gemma-2-27b-it": 0.544,
"gemma-2-9b-it": 0.36400000000000005,
"deepseek-v3": 0.9359999999999999,
"deepseek-r1": 0.916,
"qwq-32b": 0.9560000000000001,
"Average": 0.6384
},
"HerbEnv": {
"qwen2.5-3b-instruct": 0.184,
"qwen2.5-7b-instruct": 0.304,
"qwen2.5-14b-instruct": 0.784,
"qwen2.5-32b-instruct": 0.8400000000000001,
"qwen2.5-72b-instruct": 0.8039999999999999,
"llama-3.1-8b-instruct": 0.30000000000000004,
"llama-3.1-70b-instruct": 0.568,
"llama-3.2-3b-instruct": 0.128,
"llama-3.3-70b-instruct": 0.612,
"mistral-large-instruct-2411": 0.76,
"gemma-2-27b-it": 0.504,
"gemma-2-9b-it": 0.18000000000000002,
"deepseek-v3": 0.968,
"deepseek-r1": 0.9359999999999999,
"qwq-32b": 0.924,
"Average": 0.5863999999999999
},
"TransdimensionalEnv": {
"qwen2.5-3b-instruct": 0.156,
"qwen2.5-7b-instruct": 0.38400000000000006,
"qwen2.5-14b-instruct": 0.836,
"qwen2.5-32b-instruct": 0.876,
"qwen2.5-72b-instruct": 0.836,
"llama-3.1-8b-instruct": 0.44399999999999995,
"llama-3.1-70b-instruct": 0.828,
"llama-3.2-3b-instruct": 0.172,
"llama-3.3-70b-instruct": 0.86,
"mistral-large-instruct-2411": 0.86,
"gemma-2-27b-it": 0.5599999999999999,
"gemma-2-9b-it": 0.516,
"deepseek-v3": 0.968,
"deepseek-r1": 0.9359999999999999,
"qwq-32b": 0.968,
"Average": 0.6799999999999999
},
"SorcererEnv": {
"qwen2.5-3b-instruct": 0.16,
"qwen2.5-7b-instruct": 0.32400000000000007,
"qwen2.5-14b-instruct": 0.8039999999999999,
"qwen2.5-32b-instruct": 0.8240000000000001,
"qwen2.5-72b-instruct": 0.8320000000000001,
"llama-3.1-8b-instruct": 0.276,
"llama-3.1-70b-instruct": 0.6639999999999999,
"llama-3.2-3b-instruct": 0.196,
"llama-3.3-70b-instruct": 0.7360000000000001,
"mistral-large-instruct-2411": 0.8,
"gemma-2-27b-it": 0.5640000000000001,
"gemma-2-9b-it": 0.28800000000000003,
"deepseek-v3": 0.8640000000000001,
"deepseek-r1": 0.8240000000000001,
"qwq-32b": 0.8400000000000001,
"Average": 0.5997333333333333
},
"QuantumEnv": {
"qwen2.5-3b-instruct": 0.196,
"qwen2.5-7b-instruct": 0.532,
"qwen2.5-14b-instruct": 0.8720000000000001,
"qwen2.5-32b-instruct": 0.9039999999999999,
"qwen2.5-72b-instruct": 0.916,
"llama-3.1-8b-instruct": 0.45600000000000007,
"llama-3.1-70b-instruct": 0.7999999999999999,
"llama-3.2-3b-instruct": 0.168,
"llama-3.3-70b-instruct": 0.8480000000000001,
"mistral-large-instruct-2411": 0.8720000000000001,
"gemma-2-27b-it": 0.744,
"gemma-2-9b-it": 0.544,
"deepseek-v3": 0.884,
"deepseek-r1": 0.8640000000000001,
"qwq-32b": 0.868,
"Average": 0.6978666666666666
},
"AstronomyEnv": {
"qwen2.5-3b-instruct": 0.172,
"qwen2.5-7b-instruct": 0.42800000000000005,
"qwen2.5-14b-instruct": 0.716,
"qwen2.5-32b-instruct": 0.676,
"qwen2.5-72b-instruct": 0.748,
"llama-3.1-8b-instruct": 0.336,
"llama-3.1-70b-instruct": 0.692,
"llama-3.2-3b-instruct": 0.176,
"llama-3.3-70b-instruct": 0.6519999999999999,
"mistral-large-instruct-2411": 0.7999999999999999,
"gemma-2-27b-it": 0.508,
"gemma-2-9b-it": 0.372,
"deepseek-v3": 0.748,
"deepseek-r1": 0.8200000000000001,
"qwq-32b": 0.852,
"Average": 0.5797333333333333
},
"MusicGenresEnv": {
"qwen2.5-3b-instruct": 0.22000000000000003,
"qwen2.5-7b-instruct": 0.42000000000000004,
"qwen2.5-14b-instruct": 0.72,
"qwen2.5-32b-instruct": 0.716,
"qwen2.5-72b-instruct": 0.696,
"llama-3.1-8b-instruct": 0.35200000000000004,
"llama-3.1-70b-instruct": 0.6280000000000001,
"llama-3.2-3b-instruct": 0.136,
"llama-3.3-70b-instruct": 0.592,
"mistral-large-instruct-2411": 0.732,
"gemma-2-27b-it": 0.44800000000000006,
"gemma-2-9b-it": 0.332,
"deepseek-v3": 0.748,
"deepseek-r1": 0.792,
"qwq-32b": 0.876,
"Average": 0.5605333333333334
},
"CloudEnv": {
"qwen2.5-3b-instruct": 0.21199999999999997,
"qwen2.5-7b-instruct": 0.42000000000000004,
"qwen2.5-14b-instruct": 0.76,
"qwen2.5-32b-instruct": 0.656,
"qwen2.5-72b-instruct": 0.712,
"llama-3.1-8b-instruct": 0.42000000000000004,
"llama-3.1-70b-instruct": 0.664,
"llama-3.2-3b-instruct": 0.22800000000000004,
"llama-3.3-70b-instruct": 0.696,
"mistral-large-instruct-2411": 0.8360000000000001,
"gemma-2-27b-it": 0.6,
"gemma-2-9b-it": 0.4,
"deepseek-v3": 0.8200000000000001,
"deepseek-r1": 0.908,
"qwq-32b": 0.9120000000000001,
"Average": 0.6162666666666667
},
"CuisineEnv": {
"qwen2.5-3b-instruct": 0.21600000000000003,
"qwen2.5-7b-instruct": 0.316,
"qwen2.5-14b-instruct": 0.6960000000000001,
"qwen2.5-32b-instruct": 0.664,
"qwen2.5-72b-instruct": 0.656,
"llama-3.1-8b-instruct": 0.22799999999999998,
"llama-3.1-70b-instruct": 0.476,
"llama-3.2-3b-instruct": 0.152,
"llama-3.3-70b-instruct": 0.44400000000000006,
"mistral-large-instruct-2411": 0.644,
"gemma-2-27b-it": 0.27599999999999997,
"gemma-2-9b-it": 0.156,
"deepseek-v3": 0.8400000000000001,
"deepseek-r1": 0.7959999999999999,
"qwq-32b": 0.8800000000000001,
"Average": 0.49599999999999994
},
"PlantEnv": {
"qwen2.5-3b-instruct": 0.168,
"qwen2.5-7b-instruct": 0.236,
"qwen2.5-14b-instruct": 0.34,
"qwen2.5-32b-instruct": 0.22000000000000003,
"qwen2.5-72b-instruct": 0.22799999999999998,
"llama-3.1-8b-instruct": 0.148,
"llama-3.1-70b-instruct": 0.16,
"llama-3.2-3b-instruct": 0.084,
"llama-3.3-70b-instruct": 0.07599999999999998,
"mistral-large-instruct-2411": 0.264,
"gemma-2-27b-it": 0.14400000000000002,
"gemma-2-9b-it": 0.092,
"deepseek-v3": 0.512,
"deepseek-r1": 0.5,
"qwq-32b": 0.548,
"Average": 0.24800000000000003
},
"HistoricalEnv": {
"qwen2.5-3b-instruct": 0.24,
"qwen2.5-7b-instruct": 0.368,
"qwen2.5-14b-instruct": 0.5800000000000001,
"qwen2.5-32b-instruct": 0.476,
"qwen2.5-72b-instruct": 0.512,
"llama-3.1-8b-instruct": 0.332,
"llama-3.1-70b-instruct": 0.616,
"llama-3.2-3b-instruct": 0.2,
"llama-3.3-70b-instruct": 0.652,
"mistral-large-instruct-2411": 0.6880000000000001,
"gemma-2-27b-it": 0.5,
"gemma-2-9b-it": 0.376,
"deepseek-v3": 0.748,
"deepseek-r1": 0.828,
"qwq-32b": 0.884,
"Average": 0.5333333333333334
},
"GadgetEnv": {
"qwen2.5-3b-instruct": 0.124,
"qwen2.5-7b-instruct": 0.312,
"qwen2.5-14b-instruct": 0.852,
"qwen2.5-32b-instruct": 0.8640000000000001,
"qwen2.5-72b-instruct": 0.892,
"llama-3.1-8b-instruct": 0.284,
"llama-3.1-70b-instruct": 0.692,
"llama-3.2-3b-instruct": 0.11200000000000002,
"llama-3.3-70b-instruct": 0.7360000000000001,
"mistral-large-instruct-2411": 0.884,
"gemma-2-27b-it": 0.32799999999999996,
"gemma-2-9b-it": 0.184,
"deepseek-v3": 0.9640000000000001,
"deepseek-r1": 0.932,
"qwq-32b": 0.932,
"Average": 0.6061333333333334
},
"TimeTravelEnv": {
"qwen2.5-3b-instruct": 0.128,
"qwen2.5-7b-instruct": 0.292,
"qwen2.5-14b-instruct": 0.808,
"qwen2.5-32b-instruct": 0.828,
"qwen2.5-72b-instruct": 0.8039999999999999,
"llama-3.1-8b-instruct": 0.376,
"llama-3.1-70b-instruct": 0.684,
"llama-3.2-3b-instruct": 0.124,
"llama-3.3-70b-instruct": 0.716,
"mistral-large-instruct-2411": 0.884,
"gemma-2-27b-it": 0.32799999999999996,
"gemma-2-9b-it": 0.21600000000000003,
"deepseek-v3": 0.9399999999999998,
"deepseek-r1": 0.932,
"qwq-32b": 0.924,
"Average": 0.5989333333333333
},
"PollutionEnv": {
"qwen2.5-3b-instruct": 0.136,
"qwen2.5-7b-instruct": 0.328,
"qwen2.5-14b-instruct": 0.792,
"qwen2.5-32b-instruct": 0.7120000000000001,
"qwen2.5-72b-instruct": 0.704,
"llama-3.1-8b-instruct": 0.316,
"llama-3.1-70b-instruct": 0.664,
"llama-3.2-3b-instruct": 0.124,
"llama-3.3-70b-instruct": 0.6960000000000001,
"mistral-large-instruct-2411": 0.784,
"gemma-2-27b-it": 0.336,
"gemma-2-9b-it": 0.252,
"deepseek-v3": 0.8640000000000001,
"deepseek-r1": 0.8560000000000001,
"qwq-32b": 0.852,
"Average": 0.5610666666666666
},
"DemographicEnv": {
"qwen2.5-3b-instruct": 0.072,
"qwen2.5-7b-instruct": 0.42800000000000005,
"qwen2.5-14b-instruct": 0.68,
"qwen2.5-32b-instruct": 0.7799999999999999,
"qwen2.5-72b-instruct": 0.7719999999999999,
"llama-3.1-8b-instruct": 0.272,
"llama-3.1-70b-instruct": 0.6239999999999999,
"llama-3.2-3b-instruct": 0.176,
"llama-3.3-70b-instruct": 0.748,
"mistral-large-instruct-2411": 0.8200000000000001,
"gemma-2-27b-it": 0.356,
"gemma-2-9b-it": 0.156,
"deepseek-v3": 0.8960000000000001,
"deepseek-r1": 0.876,
"qwq-32b": 0.8960000000000001,
"Average": 0.5701333333333333
},
"GeneticEnv": {
"qwen2.5-3b-instruct": 0.084,
"qwen2.5-7b-instruct": 0.392,
"qwen2.5-14b-instruct": 0.884,
"qwen2.5-32b-instruct": 0.9279999999999999,
"qwen2.5-72b-instruct": 0.9400000000000001,
"llama-3.1-8b-instruct": 0.45999999999999996,
"llama-3.1-70b-instruct": 0.9,
"llama-3.2-3b-instruct": 0.192,
"llama-3.3-70b-instruct": 0.916,
"mistral-large-instruct-2411": 0.9040000000000001,
"gemma-2-27b-it": 0.776,
"gemma-2-9b-it": 0.548,
"deepseek-v3": 0.984,
"deepseek-r1": 0.952,
"qwq-32b": 0.932,
"Average": 0.7194666666666667
},
"CraftsmanEnv": {
"qwen2.5-3b-instruct": 0.14400000000000002,
"qwen2.5-7b-instruct": 0.256,
"qwen2.5-14b-instruct": 0.624,
"qwen2.5-32b-instruct": 0.736,
"qwen2.5-72b-instruct": 0.664,
"llama-3.1-8b-instruct": 0.22000000000000003,
"llama-3.1-70b-instruct": 0.524,
"llama-3.2-3b-instruct": 0.10800000000000001,
"llama-3.3-70b-instruct": 0.41600000000000004,
"mistral-large-instruct-2411": 0.7080000000000001,
"gemma-2-27b-it": 0.324,
"gemma-2-9b-it": 0.096,
"deepseek-v3": 0.9,
"deepseek-r1": 0.7879999999999999,
"qwq-32b": 0.8160000000000001,
"Average": 0.4882666666666667
},
"StarConstellationEnv": {
"qwen2.5-3b-instruct": 0.1,
"qwen2.5-7b-instruct": 0.332,
"qwen2.5-14b-instruct": 0.5960000000000001,
"qwen2.5-32b-instruct": 0.572,
"qwen2.5-72b-instruct": 0.5840000000000001,
"llama-3.1-8b-instruct": 0.376,
"llama-3.1-70b-instruct": 0.4640000000000001,
"llama-3.2-3b-instruct": 0.136,
"llama-3.3-70b-instruct": 0.41200000000000003,
"mistral-large-instruct-2411": 0.6120000000000001,
"gemma-2-27b-it": 0.472,
"gemma-2-9b-it": 0.22799999999999998,
"deepseek-v3": 0.744,
"deepseek-r1": 0.748,
"qwq-32b": 0.736,
"Average": 0.47413333333333335
},
"MythicalCreatureEnv": {
"qwen2.5-3b-instruct": 0.2,
"qwen2.5-7b-instruct": 0.324,
"qwen2.5-14b-instruct": 0.632,
"qwen2.5-32b-instruct": 0.712,
"qwen2.5-72b-instruct": 0.668,
"llama-3.1-8b-instruct": 0.31200000000000006,
"llama-3.1-70b-instruct": 0.62,
"llama-3.2-3b-instruct": 0.11200000000000002,
"llama-3.3-70b-instruct": 0.648,
"mistral-large-instruct-2411": 0.7480000000000001,
"gemma-2-27b-it": 0.42799999999999994,
"gemma-2-9b-it": 0.268,
"deepseek-v3": 0.8400000000000001,
"deepseek-r1": 0.8400000000000001,
"qwq-32b": 0.852,
"Average": 0.5469333333333333
},
"ArtStyleEnv": {
"qwen2.5-3b-instruct": 0.136,
"qwen2.5-7b-instruct": 0.332,
"qwen2.5-14b-instruct": 0.78,
"qwen2.5-32b-instruct": 0.8320000000000001,
"qwen2.5-72b-instruct": 0.748,
"llama-3.1-8b-instruct": 0.356,
"llama-3.1-70b-instruct": 0.616,
"llama-3.2-3b-instruct": 0.17200000000000001,
"llama-3.3-70b-instruct": 0.6199999999999999,
"mistral-large-instruct-2411": 0.828,
"gemma-2-27b-it": 0.43200000000000005,
"gemma-2-9b-it": 0.256,
"deepseek-v3": 0.876,
"deepseek-r1": 0.8200000000000001,
"qwq-32b": 0.868,
"Average": 0.5781333333333335
},
"CookingEnv": {
"qwen2.5-3b-instruct": 0.13999999999999999,
"qwen2.5-7b-instruct": 0.44799999999999995,
"qwen2.5-14b-instruct": 0.76,
"qwen2.5-32b-instruct": 0.7440000000000001,
"qwen2.5-72b-instruct": 0.7,
"llama-3.1-8b-instruct": 0.364,
"llama-3.1-70b-instruct": 0.6839999999999999,
"llama-3.2-3b-instruct": 0.156,
"llama-3.3-70b-instruct": 0.656,
"mistral-large-instruct-2411": 0.74,
"gemma-2-27b-it": 0.48,
"gemma-2-9b-it": 0.364,
"deepseek-v3": 0.8640000000000001,
"deepseek-r1": 0.812,
"qwq-32b": 0.9,
"Average": 0.5874666666666666
},
"HistoricalBattleEnv": {
"qwen2.5-3b-instruct": 0.256,
"qwen2.5-7b-instruct": 0.292,
"qwen2.5-14b-instruct": 0.45999999999999996,
"qwen2.5-32b-instruct": 0.476,
"qwen2.5-72b-instruct": 0.42400000000000004,
"llama-3.1-8b-instruct": 0.28400000000000003,
"llama-3.1-70b-instruct": 0.492,
"llama-3.2-3b-instruct": 0.148,
"llama-3.3-70b-instruct": 0.62,
"mistral-large-instruct-2411": 0.608,
"gemma-2-27b-it": 0.388,
"gemma-2-9b-it": 0.34,
"deepseek-v3": 0.724,
"deepseek-r1": 0.788,
"qwq-32b": 0.8560000000000001,
"Average": 0.47706666666666664
},
"FungalEnv": {
"qwen2.5-3b-instruct": 0.15999999999999998,
"qwen2.5-7b-instruct": 0.46399999999999997,
"qwen2.5-14b-instruct": 0.664,
"qwen2.5-32b-instruct": 0.728,
"qwen2.5-72b-instruct": 0.6839999999999999,
"llama-3.1-8b-instruct": 0.41600000000000004,
"llama-3.1-70b-instruct": 0.5840000000000001,
"llama-3.2-3b-instruct": 0.14,
"llama-3.3-70b-instruct": 0.644,
"mistral-large-instruct-2411": 0.7440000000000001,
"gemma-2-27b-it": 0.536,
"gemma-2-9b-it": 0.184,
"deepseek-v3": 0.844,
"deepseek-r1": 0.764,
"qwq-32b": 0.7879999999999999,
"Average": 0.5562666666666666
},
"CryptographyEnv": {
"qwen2.5-3b-instruct": 0.24000000000000005,
"qwen2.5-7b-instruct": 0.23199999999999998,
"qwen2.5-14b-instruct": 0.508,
"qwen2.5-32b-instruct": 0.5760000000000001,
"qwen2.5-72b-instruct": 0.528,
"llama-3.1-8b-instruct": 0.29600000000000004,
"llama-3.1-70b-instruct": 0.524,
"llama-3.2-3b-instruct": 0.11600000000000002,
"llama-3.3-70b-instruct": 0.512,
"mistral-large-instruct-2411": 0.6799999999999999,
"gemma-2-27b-it": 0.328,
"gemma-2-9b-it": 0.192,
"deepseek-v3": 0.784,
"deepseek-r1": 0.74,
"qwq-32b": 0.8480000000000001,
"Average": 0.4736
},
"StorageEnv": {
"qwen2.5-3b-instruct": 0.22800000000000004,
"qwen2.5-7b-instruct": 0.44000000000000006,
"qwen2.5-14b-instruct": 0.852,
"qwen2.5-32b-instruct": 0.884,
"qwen2.5-72b-instruct": 0.8119999999999999,
"llama-3.1-8b-instruct": 0.34800000000000003,
"llama-3.1-70b-instruct": 0.724,
"llama-3.2-3b-instruct": 0.21600000000000003,
"llama-3.3-70b-instruct": 0.796,
"mistral-large-instruct-2411": 0.8880000000000001,
"gemma-2-27b-it": 0.596,
"gemma-2-9b-it": 0.392,
"deepseek-v3": 0.9640000000000001,
"deepseek-r1": 0.9119999999999999,
"qwq-32b": 0.944,
"Average": 0.6663999999999999
},
"RoverEnv": {
"qwen2.5-3b-instruct": 0.14400000000000002,
"qwen2.5-7b-instruct": 0.236,
"qwen2.5-14b-instruct": 0.8480000000000001,
"qwen2.5-32b-instruct": 0.8360000000000001,
"qwen2.5-72b-instruct": 0.796,
"llama-3.1-8b-instruct": 0.28400000000000003,
"llama-3.1-70b-instruct": 0.612,
"llama-3.2-3b-instruct": 0.148,
"llama-3.3-70b-instruct": 0.724,
"mistral-large-instruct-2411": 0.828,
"gemma-2-27b-it": 0.4600000000000001,
"gemma-2-9b-it": 0.072,
"deepseek-v3": 0.9200000000000002,
"deepseek-r1": 0.9,
"qwq-32b": 0.8720000000000001,
"Average": 0.5786666666666668
},
"FashionEnv": {
"qwen2.5-3b-instruct": 0.17200000000000001,
"qwen2.5-7b-instruct": 0.304,
"qwen2.5-14b-instruct": 0.8240000000000001,
"qwen2.5-32b-instruct": 0.808,
"qwen2.5-72b-instruct": 0.768,
"llama-3.1-8b-instruct": 0.32,
"llama-3.1-70b-instruct": 0.6,
"llama-3.2-3b-instruct": 0.16399999999999998,
"llama-3.3-70b-instruct": 0.6160000000000001,
"mistral-large-instruct-2411": 0.756,
"gemma-2-27b-it": 0.524,
"gemma-2-9b-it": 0.292,
"deepseek-v3": 0.86,
"deepseek-r1": 0.756,
"qwq-32b": 0.86,
"Average": 0.5749333333333334
},
"LicenseEnv": {
"qwen2.5-3b-instruct": 0.196,
"qwen2.5-7b-instruct": 0.29200000000000004,
"qwen2.5-14b-instruct": 0.556,
"qwen2.5-32b-instruct": 0.44000000000000006,
"qwen2.5-72b-instruct": 0.484,
"llama-3.1-8b-instruct": 0.26,
"llama-3.1-70b-instruct": 0.496,
"llama-3.2-3b-instruct": 0.072,
"llama-3.3-70b-instruct": 0.45999999999999996,
"mistral-large-instruct-2411": 0.504,
"gemma-2-27b-it": 0.37600000000000006,
"gemma-2-9b-it": 0.296,
"deepseek-v3": 0.556,
"deepseek-r1": 0.52,
"qwq-32b": 0.5800000000000001,
"Average": 0.4058666666666667
},
"VirusClassificationEnv": {
"qwen2.5-3b-instruct": 0.22000000000000003,
"qwen2.5-7b-instruct": 0.28,
"qwen2.5-14b-instruct": 0.384,
"qwen2.5-32b-instruct": 0.38,
"qwen2.5-72b-instruct": 0.42800000000000005,
"llama-3.1-8b-instruct": 0.256,
"llama-3.1-70b-instruct": 0.332,
"llama-3.2-3b-instruct": 0.156,
"llama-3.3-70b-instruct": 0.396,
"mistral-large-instruct-2411": 0.532,
"gemma-2-27b-it": 0.34,
"gemma-2-9b-it": 0.31200000000000006,
"deepseek-v3": 0.536,
"deepseek-r1": 0.64,
"qwq-32b": 0.684,
"Average": 0.3917333333333333
},
"TestingEnv": {
"qwen2.5-3b-instruct": 0.19200000000000003,
"qwen2.5-7b-instruct": 0.22000000000000003,
"qwen2.5-14b-instruct": 0.608,
"qwen2.5-32b-instruct": 0.648,
"qwen2.5-72b-instruct": 0.708,
"llama-3.1-8b-instruct": 0.332,
"llama-3.1-70b-instruct": 0.68,
"llama-3.2-3b-instruct": 0.17200000000000001,
"llama-3.3-70b-instruct": 0.7040000000000001,
"mistral-large-instruct-2411": 0.764,
"gemma-2-27b-it": 0.22799999999999998,
"gemma-2-9b-it": 0.26,
"deepseek-v3": 0.8880000000000001,
"deepseek-r1": 0.764,
"qwq-32b": 0.7999999999999999,
"Average": 0.5312
},
"NarrativeDetectEnv": {
"qwen2.5-3b-instruct": 0.148,
"qwen2.5-7b-instruct": 0.30000000000000004,
"qwen2.5-14b-instruct": 0.552,
"qwen2.5-32b-instruct": 0.8440000000000001,
"qwen2.5-72b-instruct": 0.76,
"llama-3.1-8b-instruct": 0.28800000000000003,
"llama-3.1-70b-instruct": 0.6279999999999999,
"llama-3.2-3b-instruct": 0.10400000000000001,
"llama-3.3-70b-instruct": 0.704,
"mistral-large-instruct-2411": 0.7919999999999999,
"gemma-2-27b-it": 0.328,
"gemma-2-9b-it": 0.192,
"deepseek-v3": 0.8560000000000001,
"deepseek-r1": 0.748,
"qwq-32b": 0.784,
"Average": 0.5352
},
"RenewableEnergyEnv": {
"qwen2.5-3b-instruct": 0.184,
"qwen2.5-7b-instruct": 0.44399999999999995,
"qwen2.5-14b-instruct": 0.648,
"qwen2.5-32b-instruct": 0.932,
"qwen2.5-72b-instruct": 0.8880000000000001,
"llama-3.1-8b-instruct": 0.396,
"llama-3.1-70b-instruct": 0.812,
"llama-3.2-3b-instruct": 0.2,
"llama-3.3-70b-instruct": 0.8240000000000001,
"mistral-large-instruct-2411": 0.8560000000000001,
"gemma-2-27b-it": 0.348,
"gemma-2-9b-it": 0.188,
"deepseek-v3": 0.96,
"deepseek-r1": 0.9800000000000001,
"qwq-32b": 0.9800000000000001,
"Average": 0.6426666666666667
},
"CelestialEnv": {
"qwen2.5-3b-instruct": 0.20400000000000001,
"qwen2.5-7b-instruct": 0.252,
"qwen2.5-14b-instruct": 0.728,
"qwen2.5-32b-instruct": 0.792,
"qwen2.5-72b-instruct": 0.7239999999999999,
"llama-3.1-8b-instruct": 0.256,
"llama-3.1-70b-instruct": 0.6920000000000001,
"llama-3.2-3b-instruct": 0.192,
"llama-3.3-70b-instruct": 0.744,
"mistral-large-instruct-2411": 0.82,
"gemma-2-27b-it": 0.528,
"gemma-2-9b-it": 0.344,
"deepseek-v3": 0.8480000000000001,
"deepseek-r1": 0.8360000000000001,
"qwq-32b": 0.8879999999999999,
"Average": 0.5898666666666668
},
"SpiceEnv": {
"qwen2.5-3b-instruct": 0.21199999999999997,
"qwen2.5-7b-instruct": 0.332,
"qwen2.5-14b-instruct": 0.672,
"qwen2.5-32b-instruct": 0.476,
"qwen2.5-72b-instruct": 0.5880000000000001,
"llama-3.1-8b-instruct": 0.32799999999999996,
"llama-3.1-70b-instruct": 0.40800000000000003,
"llama-3.2-3b-instruct": 0.22000000000000003,
"llama-3.3-70b-instruct": 0.336,
"mistral-large-instruct-2411": 0.5800000000000001,
"gemma-2-27b-it": 0.28400000000000003,
"gemma-2-9b-it": 0.172,
"deepseek-v3": 0.908,
"deepseek-r1": 0.7679999999999999,
"qwq-32b": 0.8720000000000001,
"Average": 0.47706666666666664
},
"WildlifeEnv": {
"qwen2.5-3b-instruct": 0.21600000000000003,
"qwen2.5-7b-instruct": 0.352,
"qwen2.5-14b-instruct": 0.644,
"qwen2.5-32b-instruct": 0.592,
"qwen2.5-72b-instruct": 0.616,
"llama-3.1-8b-instruct": 0.316,
"llama-3.1-70b-instruct": 0.544,
"llama-3.2-3b-instruct": 0.23199999999999998,
"llama-3.3-70b-instruct": 0.616,
"mistral-large-instruct-2411": 0.628,
"gemma-2-27b-it": 0.45199999999999996,
"gemma-2-9b-it": 0.344,
"deepseek-v3": 0.736,
"deepseek-r1": 0.6040000000000001,
"qwq-32b": 0.716,
"Average": 0.5072
},
"VehicleEnv": {
"qwen2.5-3b-instruct": 0.172,
"qwen2.5-7b-instruct": 0.308,
"qwen2.5-14b-instruct": 0.54,
"qwen2.5-32b-instruct": 0.776,
"qwen2.5-72b-instruct": 0.78,
"llama-3.1-8b-instruct": 0.248,
"llama-3.1-70b-instruct": 0.62,
"llama-3.2-3b-instruct": 0.152,
"llama-3.3-70b-instruct": 0.6960000000000001,
"mistral-large-instruct-2411": 0.8800000000000001,
"gemma-2-27b-it": 0.44799999999999995,
"gemma-2-9b-it": 0.248,
"deepseek-v3": 0.9199999999999999,
"deepseek-r1": 0.9199999999999999,
"qwq-32b": 0.916,
"Average": 0.5749333333333333
},
"BeverageEnv": {
"qwen2.5-3b-instruct": 0.128,
"qwen2.5-7b-instruct": 0.296,
"qwen2.5-14b-instruct": 0.792,
"qwen2.5-32b-instruct": 0.6880000000000001,
"qwen2.5-72b-instruct": 0.724,
"llama-3.1-8b-instruct": 0.41200000000000003,
"llama-3.1-70b-instruct": 0.6199999999999999,
"llama-3.2-3b-instruct": 0.16399999999999998,
"llama-3.3-70b-instruct": 0.5800000000000001,
"mistral-large-instruct-2411": 0.748,
"gemma-2-27b-it": 0.40800000000000003,
"gemma-2-9b-it": 0.296,
"deepseek-v3": 0.8800000000000001,
"deepseek-r1": 0.7520000000000001,
"qwq-32b": 0.844,
"Average": 0.5554666666666667
},
"ControlEnv": {
"qwen2.5-3b-instruct": 0.12800000000000003,
"qwen2.5-7b-instruct": 0.364,
"qwen2.5-14b-instruct": 0.68,
"qwen2.5-32b-instruct": 0.8320000000000001,
"qwen2.5-72b-instruct": 0.8400000000000001,
"llama-3.1-8b-instruct": 0.364,
"llama-3.1-70b-instruct": 0.656,
"llama-3.2-3b-instruct": 0.15599999999999997,
"llama-3.3-70b-instruct": 0.6320000000000001,
"mistral-large-instruct-2411": 0.784,
"gemma-2-27b-it": 0.4640000000000001,
"gemma-2-9b-it": 0.18,
"deepseek-v3": 0.9119999999999999,
"deepseek-r1": 0.9119999999999999,
"qwq-32b": 0.932,
"Average": 0.5890666666666665
},
"CurrencyEnv": {
"qwen2.5-3b-instruct": 0.252,
"qwen2.5-7b-instruct": 0.392,
"qwen2.5-14b-instruct": 0.8560000000000001,
"qwen2.5-32b-instruct": 0.884,
"qwen2.5-72b-instruct": 0.836,
"llama-3.1-8b-instruct": 0.476,
"llama-3.1-70b-instruct": 0.7520000000000001,
"llama-3.2-3b-instruct": 0.22400000000000003,
"llama-3.3-70b-instruct": 0.7000000000000001,
"mistral-large-instruct-2411": 0.8960000000000001,
"gemma-2-27b-it": 0.68,
"gemma-2-9b-it": 0.196,
"deepseek-v3": 0.9800000000000001,
"deepseek-r1": 0.932,
"qwq-32b": 0.9640000000000001,
"Average": 0.668
},
"MarketingEnv": {
"qwen2.5-3b-instruct": 0.12,
"qwen2.5-7b-instruct": 0.34400000000000003,
"qwen2.5-14b-instruct": 0.524,
"qwen2.5-32b-instruct": 0.7479999999999999,
"qwen2.5-72b-instruct": 0.732,
"llama-3.1-8b-instruct": 0.30800000000000005,
"llama-3.1-70b-instruct": 0.7040000000000001,
"llama-3.2-3b-instruct": 0.14400000000000002,
"llama-3.3-70b-instruct": 0.7639999999999999,
"mistral-large-instruct-2411": 0.7600000000000001,
"gemma-2-27b-it": 0.32400000000000007,
"gemma-2-9b-it": 0.184,
"deepseek-v3": 0.812,
"deepseek-r1": 0.7959999999999999,
"qwq-32b": 0.8320000000000001,
"Average": 0.5397333333333333
},
"BotanicalEnv": {
"qwen2.5-3b-instruct": 0.18800000000000003,
"qwen2.5-7b-instruct": 0.316,
"qwen2.5-14b-instruct": 0.9119999999999999,
"qwen2.5-32b-instruct": 0.884,
"qwen2.5-72b-instruct": 0.9039999999999999,
"llama-3.1-8b-instruct": 0.4119999999999999,
"llama-3.1-70b-instruct": 0.836,
"llama-3.2-3b-instruct": 0.23600000000000004,
"llama-3.3-70b-instruct": 0.8480000000000001,
"mistral-large-instruct-2411": 0.8640000000000001,
"gemma-2-27b-it": 0.604,
"gemma-2-9b-it": 0.264,
"deepseek-v3": 0.9040000000000001,
"deepseek-r1": 0.9399999999999998,
"qwq-32b": 0.968,
"Average": 0.672
},
"CircusActEnv": {
"qwen2.5-3b-instruct": 0.17200000000000001,
"qwen2.5-7b-instruct": 0.32399999999999995,
"qwen2.5-14b-instruct": 0.64,
"qwen2.5-32b-instruct": 0.712,
"qwen2.5-72b-instruct": 0.768,
"llama-3.1-8b-instruct": 0.276,
"llama-3.1-70b-instruct": 0.648,
"llama-3.2-3b-instruct": 0.176,
"llama-3.3-70b-instruct": 0.62,
"mistral-large-instruct-2411": 0.748,
"gemma-2-27b-it": 0.384,
"gemma-2-9b-it": 0.29600000000000004,
"deepseek-v3": 0.8640000000000001,
"deepseek-r1": 0.82,
"qwq-32b": 0.8720000000000001,
"Average": 0.5546666666666668
},
"AudioDialectEnv": {
"qwen2.5-3b-instruct": 0.128,
"qwen2.5-7b-instruct": 0.312,
"qwen2.5-14b-instruct": 0.5800000000000001,
"qwen2.5-32b-instruct": 0.6,
"qwen2.5-72b-instruct": 0.528,
"llama-3.1-8b-instruct": 0.21600000000000003,
"llama-3.1-70b-instruct": 0.4,
"llama-3.2-3b-instruct": 0.132,
"llama-3.3-70b-instruct": 0.32399999999999995,
"mistral-large-instruct-2411": 0.68,
"gemma-2-27b-it": 0.28,
"gemma-2-9b-it": 0.11600000000000002,
"deepseek-v3": 0.7520000000000001,
"deepseek-r1": 0.7919999999999999,
"qwq-32b": 0.8119999999999999,
"Average": 0.4434666666666666
},
"LeadershipEnv": {
"qwen2.5-3b-instruct": 0.164,
"qwen2.5-7b-instruct": 0.372,
"qwen2.5-14b-instruct": 0.7,
"qwen2.5-32b-instruct": 0.732,
"qwen2.5-72b-instruct": 0.7639999999999999,
"llama-3.1-8b-instruct": 0.364,
"llama-3.1-70b-instruct": 0.708,
"llama-3.2-3b-instruct": 0.128,
"llama-3.3-70b-instruct": 0.6920000000000001,
"mistral-large-instruct-2411": 0.728,
"gemma-2-27b-it": 0.46799999999999997,
"gemma-2-9b-it": 0.20400000000000001,
"deepseek-v3": 0.8200000000000001,
"deepseek-r1": 0.748,
"qwq-32b": 0.828,
"Average": 0.5613333333333334
},
"TransportEnv": {
"qwen2.5-3b-instruct": 0.196,
"qwen2.5-7b-instruct": 0.372,
"qwen2.5-14b-instruct": 0.716,
"qwen2.5-32b-instruct": 0.732,
"qwen2.5-72b-instruct": 0.8,
"llama-3.1-8b-instruct": 0.316,
"llama-3.1-70b-instruct": 0.648,
"llama-3.2-3b-instruct": 0.15200000000000002,
"llama-3.3-70b-instruct": 0.6000000000000001,
"mistral-large-instruct-2411": 0.7879999999999999,
"gemma-2-27b-it": 0.44399999999999995,
"gemma-2-9b-it": 0.364,
"deepseek-v3": 0.8640000000000001,
"deepseek-r1": 0.8240000000000001,
"qwq-32b": 0.9199999999999999,
"Average": 0.5824
},
"EcologicalEnv": {
"qwen2.5-3b-instruct": 0.152,
"qwen2.5-7b-instruct": 0.45600000000000007,
"qwen2.5-14b-instruct": 0.748,
"qwen2.5-32b-instruct": 0.82,
"qwen2.5-72b-instruct": 0.792,
"llama-3.1-8b-instruct": 0.42000000000000004,
"llama-3.1-70b-instruct": 0.692,
"llama-3.2-3b-instruct": 0.21600000000000003,
"llama-3.3-70b-instruct": 0.64,
"mistral-large-instruct-2411": 0.772,
"gemma-2-27b-it": 0.5680000000000001,
"gemma-2-9b-it": 0.46799999999999997,
"deepseek-v3": 0.868,
"deepseek-r1": 0.8720000000000001,
"qwq-32b": 0.8879999999999999,
"Average": 0.6248
},
"MythicEnv": {
"qwen2.5-3b-instruct": 0.132,
"qwen2.5-7b-instruct": 0.36,
"qwen2.5-14b-instruct": 0.744,
"qwen2.5-32b-instruct": 0.74,
"qwen2.5-72b-instruct": 0.672,
"llama-3.1-8b-instruct": 0.236,
"llama-3.1-70b-instruct": 0.596,
"llama-3.2-3b-instruct": 0.12,
"llama-3.3-70b-instruct": 0.576,
"mistral-large-instruct-2411": 0.6960000000000001,
"gemma-2-27b-it": 0.45599999999999996,
"gemma-2-9b-it": 0.136,
"deepseek-v3": 0.8960000000000001,
"deepseek-r1": 0.8720000000000001,
"qwq-32b": 0.8400000000000001,
"Average": 0.5381333333333332
},
"EnzymeEnv": {
"qwen2.5-3b-instruct": 0.252,
"qwen2.5-7b-instruct": 0.43200000000000005,
"qwen2.5-14b-instruct": 0.636,
"qwen2.5-32b-instruct": 0.676,
"qwen2.5-72b-instruct": 0.676,
"llama-3.1-8b-instruct": 0.316,
"llama-3.1-70b-instruct": 0.552,
"llama-3.2-3b-instruct": 0.192,
"llama-3.3-70b-instruct": 0.5640000000000001,
"mistral-large-instruct-2411": 0.732,
"gemma-2-27b-it": 0.43600000000000005,
"gemma-2-9b-it": 0.264,
"deepseek-v3": 0.8400000000000001,
"deepseek-r1": 0.76,
"qwq-32b": 0.804,
"Average": 0.5421333333333334
},
"OSKernelEnv": {
"qwen2.5-3b-instruct": 0.192,
"qwen2.5-7b-instruct": 0.28400000000000003,
"qwen2.5-14b-instruct": 0.8119999999999999,
"qwen2.5-32b-instruct": 0.784,
"qwen2.5-72b-instruct": 0.788,
"llama-3.1-8b-instruct": 0.316,
"llama-3.1-70b-instruct": 0.6920000000000001,
"llama-3.2-3b-instruct": 0.128,
"llama-3.3-70b-instruct": 0.74,
"mistral-large-instruct-2411": 0.8559999999999999,
"gemma-2-27b-it": 0.46399999999999997,
"gemma-2-9b-it": 0.2,
"deepseek-v3": 0.9480000000000001,
"deepseek-r1": 0.96,
"qwq-32b": 0.984,
"Average": 0.6098666666666668
},
"MineralClassificationEnv": {
"qwen2.5-3b-instruct": 0.11600000000000002,
"qwen2.5-7b-instruct": 0.248,
"qwen2.5-14b-instruct": 0.8320000000000001,
"qwen2.5-32b-instruct": 0.9040000000000001,
"qwen2.5-72b-instruct": 0.884,
"llama-3.1-8b-instruct": 0.384,
"llama-3.1-70b-instruct": 0.8240000000000001,
"llama-3.2-3b-instruct": 0.14800000000000002,
"llama-3.3-70b-instruct": 0.8960000000000001,
"mistral-large-instruct-2411": 0.908,
"gemma-2-27b-it": 0.508,
"gemma-2-9b-it": 0.268,
"deepseek-v3": 0.984,
"deepseek-r1": 0.9199999999999999,
"qwq-32b": 0.9640000000000001,
"Average": 0.6525333333333333
},
"EconomicEnv": {
"qwen2.5-3b-instruct": 0.136,
"qwen2.5-7b-instruct": 0.24,
"qwen2.5-14b-instruct": 0.8560000000000001,
"qwen2.5-32b-instruct": 0.9199999999999999,
"qwen2.5-72b-instruct": 0.8960000000000001,
"llama-3.1-8b-instruct": 0.43600000000000005,
"llama-3.1-70b-instruct": 0.808,
"llama-3.2-3b-instruct": 0.152,
"llama-3.3-70b-instruct": 0.8240000000000001,
"mistral-large-instruct-2411": 0.924,
"gemma-2-27b-it": 0.45199999999999996,
"gemma-2-9b-it": 0.36,
"deepseek-v3": 0.9559999999999998,
"deepseek-r1": 0.9359999999999999,
"qwq-32b": 0.9719999999999999,
"Average": 0.6578666666666667
},
"DetectiveEnv": {
"qwen2.5-3b-instruct": 0.168,
"qwen2.5-7b-instruct": 0.38,
"qwen2.5-14b-instruct": 0.836,
"qwen2.5-32b-instruct": 0.884,
"qwen2.5-72b-instruct": 0.8480000000000001,
"llama-3.1-8b-instruct": 0.34800000000000003,
"llama-3.1-70b-instruct": 0.74,
"llama-3.2-3b-instruct": 0.248,
"llama-3.3-70b-instruct": 0.792,
"mistral-large-instruct-2411": 0.8960000000000001,
"gemma-2-27b-it": 0.512,
"gemma-2-9b-it": 0.33199999999999996,
"deepseek-v3": 0.976,
"deepseek-r1": 0.9640000000000001,
"qwq-32b": 0.984,
"Average": 0.6605333333333333
},
"ChessEnv": {
"qwen2.5-3b-instruct": 0.184,
"qwen2.5-7b-instruct": 0.27999999999999997,
"qwen2.5-14b-instruct": 0.592,
"qwen2.5-32b-instruct": 0.616,
"qwen2.5-72b-instruct": 0.5720000000000001,
"llama-3.1-8b-instruct": 0.188,
"llama-3.1-70b-instruct": 0.6639999999999999,
"llama-3.2-3b-instruct": 0.084,
"llama-3.3-70b-instruct": 0.6280000000000001,
"mistral-large-instruct-2411": 0.744,
"gemma-2-27b-it": 0.30000000000000004,
"gemma-2-9b-it": 0.096,
"deepseek-v3": 0.696,
"deepseek-r1": 0.6519999999999999,
"qwq-32b": 0.664,
"Average": 0.4639999999999999
},
"MythicalEnv": {
"qwen2.5-3b-instruct": 0.2,
"qwen2.5-7b-instruct": 0.336,
"qwen2.5-14b-instruct": 0.8039999999999999,
"qwen2.5-32b-instruct": 0.712,
"qwen2.5-72b-instruct": 0.632,
"llama-3.1-8b-instruct": 0.356,
"llama-3.1-70b-instruct": 0.54,
"llama-3.2-3b-instruct": 0.16,
"llama-3.3-70b-instruct": 0.556,
"mistral-large-instruct-2411": 0.728,
"gemma-2-27b-it": 0.54,
"gemma-2-9b-it": 0.404,
"deepseek-v3": 0.9279999999999999,
"deepseek-r1": 0.8959999999999999,
"qwq-32b": 0.876,
"Average": 0.5778666666666666
},
"ChemicalCompoundsEnv": {
"qwen2.5-3b-instruct": 0.18,
"qwen2.5-7b-instruct": 0.252,
"qwen2.5-14b-instruct": 0.40800000000000003,
"qwen2.5-32b-instruct": 0.30000000000000004,
"qwen2.5-72b-instruct": 0.28400000000000003,
"llama-3.1-8b-instruct": 0.148,
"llama-3.1-70b-instruct": 0.28,
"llama-3.2-3b-instruct": 0.14,
"llama-3.3-70b-instruct": 0.18000000000000002,
"mistral-large-instruct-2411": 0.43200000000000005,
"gemma-2-27b-it": 0.23200000000000004,
"gemma-2-9b-it": 0.13599999999999998,
"deepseek-v3": 0.46799999999999997,
"deepseek-r1": 0.624,
"qwq-32b": 0.752,
"Average": 0.32106666666666667
},
"ArchitecturalEnv": {
"qwen2.5-3b-instruct": 0.20400000000000001,
"qwen2.5-7b-instruct": 0.316,
"qwen2.5-14b-instruct": 0.72,
"qwen2.5-32b-instruct": 0.66,
"qwen2.5-72b-instruct": 0.7120000000000001,
"llama-3.1-8b-instruct": 0.256,
"llama-3.1-70b-instruct": 0.556,
"llama-3.2-3b-instruct": 0.132,
"llama-3.3-70b-instruct": 0.508,
"mistral-large-instruct-2411": 0.724,
"gemma-2-27b-it": 0.488,
"gemma-2-9b-it": 0.236,
"deepseek-v3": 0.82,
"deepseek-r1": 0.744,
"qwq-32b": 0.8240000000000001,
"Average": 0.5266666666666666
},
"ComputationEnv": {
"qwen2.5-3b-instruct": 0.152,
"qwen2.5-7b-instruct": 0.248,
"qwen2.5-14b-instruct": 0.76,
"qwen2.5-32b-instruct": 0.884,
"qwen2.5-72b-instruct": 0.8560000000000001,
"llama-3.1-8b-instruct": 0.32799999999999996,
"llama-3.1-70b-instruct": 0.788,
"llama-3.2-3b-instruct": 0.13999999999999999,
"llama-3.3-70b-instruct": 0.8560000000000001,
"mistral-large-instruct-2411": 0.828,
"gemma-2-27b-it": 0.45199999999999996,
"gemma-2-9b-it": 0.252,
"deepseek-v3": 0.96,
"deepseek-r1": 0.9399999999999998,
"qwq-32b": 0.908,
"Average": 0.6234666666666667
},
"MachinePartEnv": {
"qwen2.5-3b-instruct": 0.14,
"qwen2.5-7b-instruct": 0.32,
"qwen2.5-14b-instruct": 0.8240000000000001,
"qwen2.5-32b-instruct": 0.8800000000000001,
"qwen2.5-72b-instruct": 0.828,
"llama-3.1-8b-instruct": 0.376,
"llama-3.1-70b-instruct": 0.8200000000000001,
"llama-3.2-3b-instruct": 0.168,
"llama-3.3-70b-instruct": 0.8960000000000001,
"mistral-large-instruct-2411": 0.876,
"gemma-2-27b-it": 0.508,
"gemma-2-9b-it": 0.268,
"deepseek-v3": 0.9719999999999999,
"deepseek-r1": 0.952,
"qwq-32b": 0.916,
"Average": 0.6496
},
"LiteraryEnv": {
"qwen2.5-3b-instruct": 0.10400000000000001,
"qwen2.5-7b-instruct": 0.328,
"qwen2.5-14b-instruct": 0.8800000000000001,
"qwen2.5-32b-instruct": 0.9279999999999999,
"qwen2.5-72b-instruct": 0.9,
"llama-3.1-8b-instruct": 0.336,
"llama-3.1-70b-instruct": 0.664,
"llama-3.2-3b-instruct": 0.13999999999999999,
"llama-3.3-70b-instruct": 0.664,
"mistral-large-instruct-2411": 0.884,
"gemma-2-27b-it": 0.44399999999999995,
"gemma-2-9b-it": 0.13999999999999999,
"deepseek-v3": 0.984,
"deepseek-r1": 0.9119999999999999,
"qwq-32b": 0.968,
"Average": 0.6184
},
"MarineEnv": {
"qwen2.5-3b-instruct": 0.144,
"qwen2.5-7b-instruct": 0.384,
"qwen2.5-14b-instruct": 0.8720000000000001,
"qwen2.5-32b-instruct": 0.844,
"qwen2.5-72b-instruct": 0.8320000000000001,
"llama-3.1-8b-instruct": 0.308,
"llama-3.1-70b-instruct": 0.636,
"llama-3.2-3b-instruct": 0.12000000000000002,
"llama-3.3-70b-instruct": 0.704,
"mistral-large-instruct-2411": 0.7879999999999999,
"gemma-2-27b-it": 0.484,
"gemma-2-9b-it": 0.23199999999999998,
"deepseek-v3": 0.884,
"deepseek-r1": 0.9,
"qwq-32b": 0.8880000000000001,
"Average": 0.6013333333333334
},
"PhilosophyEnv": {
"qwen2.5-3b-instruct": 0.144,
"qwen2.5-7b-instruct": 0.3,
"qwen2.5-14b-instruct": 0.7280000000000001,
"qwen2.5-32b-instruct": 0.82,
"qwen2.5-72b-instruct": 0.8719999999999999,
"llama-3.1-8b-instruct": 0.32799999999999996,
"llama-3.1-70b-instruct": 0.764,
"llama-3.2-3b-instruct": 0.036000000000000004,
"llama-3.3-70b-instruct": 0.796,
"mistral-large-instruct-2411": 0.7879999999999999,
"gemma-2-27b-it": 0.372,
"gemma-2-9b-it": 0.28,
"deepseek-v3": 0.844,
"deepseek-r1": 0.78,
"qwq-32b": 0.8320000000000001,
"Average": 0.5789333333333334
},
"ArchaeologicalEnv": {
"qwen2.5-3b-instruct": 0.18,
"qwen2.5-7b-instruct": 0.38,
"qwen2.5-14b-instruct": 0.58,
"qwen2.5-32b-instruct": 0.608,
"qwen2.5-72b-instruct": 0.5640000000000001,
"llama-3.1-8b-instruct": 0.26,
"llama-3.1-70b-instruct": 0.608,
"llama-3.2-3b-instruct": 0.192,
"llama-3.3-70b-instruct": 0.548,
"mistral-large-instruct-2411": 0.64,
"gemma-2-27b-it": 0.476,
"gemma-2-9b-it": 0.30000000000000004,
"deepseek-v3": 0.916,
"deepseek-r1": 0.7040000000000001,
"qwq-32b": 0.7559999999999999,
"Average": 0.5141333333333333
},
"GemstoneEnv": {
"qwen2.5-3b-instruct": 0.192,
"qwen2.5-7b-instruct": 0.264,
"qwen2.5-14b-instruct": 0.492,
"qwen2.5-32b-instruct": 0.45599999999999996,
"qwen2.5-72b-instruct": 0.44000000000000006,
"llama-3.1-8b-instruct": 0.192,
"llama-3.1-70b-instruct": 0.40800000000000003,
"llama-3.2-3b-instruct": 0.15200000000000002,
"llama-3.3-70b-instruct": 0.45599999999999996,
"mistral-large-instruct-2411": 0.528,
"gemma-2-27b-it": 0.33999999999999997,
"gemma-2-9b-it": 0.256,
"deepseek-v3": 0.5680000000000001,
"deepseek-r1": 0.5680000000000001,
"qwq-32b": 0.636,
"Average": 0.3965333333333333
},
"MicrobiologyEnv": {
"qwen2.5-3b-instruct": 0.14400000000000002,
"qwen2.5-7b-instruct": 0.38400000000000006,
"qwen2.5-14b-instruct": 0.752,
"qwen2.5-32b-instruct": 0.7,
"qwen2.5-72b-instruct": 0.844,
"llama-3.1-8b-instruct": 0.316,
"llama-3.1-70b-instruct": 0.512,
"llama-3.2-3b-instruct": 0.12000000000000002,
"llama-3.3-70b-instruct": 0.496,
"mistral-large-instruct-2411": 0.764,
"gemma-2-27b-it": 0.504,
"gemma-2-9b-it": 0.172,
"deepseek-v3": 0.9279999999999999,
"deepseek-r1": 0.952,
"qwq-32b": 0.932,
"Average": 0.568
},
"SciFiEnv": {
"qwen2.5-3b-instruct": 0.192,
"qwen2.5-7b-instruct": 0.384,
"qwen2.5-14b-instruct": 0.7879999999999999,
"qwen2.5-32b-instruct": 0.776,
"qwen2.5-72b-instruct": 0.7879999999999999,
"llama-3.1-8b-instruct": 0.35200000000000004,
"llama-3.1-70b-instruct": 0.664,
"llama-3.2-3b-instruct": 0.164,
"llama-3.3-70b-instruct": 0.588,
"mistral-large-instruct-2411": 0.736,
"gemma-2-27b-it": 0.52,
"gemma-2-9b-it": 0.33599999999999997,
"deepseek-v3": 0.9279999999999999,
"deepseek-r1": 0.9199999999999999,
"qwq-32b": 0.9,
"Average": 0.6023999999999999
},
"HormoneEnv": {
"qwen2.5-3b-instruct": 0.152,
"qwen2.5-7b-instruct": 0.40800000000000003,
"qwen2.5-14b-instruct": 0.7999999999999999,
"qwen2.5-32b-instruct": 0.784,
"qwen2.5-72b-instruct": 0.764,
"llama-3.1-8b-instruct": 0.336,
"llama-3.1-70b-instruct": 0.76,
"llama-3.2-3b-instruct": 0.184,
"llama-3.3-70b-instruct": 0.8480000000000001,
"mistral-large-instruct-2411": 0.8,
"gemma-2-27b-it": 0.524,
"gemma-2-9b-it": 0.312,
"deepseek-v3": 0.9480000000000001,
"deepseek-r1": 0.944,
"qwq-32b": 0.852,
"Average": 0.6277333333333334
},
"SculptorEnv": {
"qwen2.5-3b-instruct": 0.23200000000000004,
"qwen2.5-7b-instruct": 0.4159999999999999,
"qwen2.5-14b-instruct": 0.7079999999999999,
"qwen2.5-32b-instruct": 0.636,
"qwen2.5-72b-instruct": 0.6,
"llama-3.1-8b-instruct": 0.22799999999999998,
"llama-3.1-70b-instruct": 0.484,
"llama-3.2-3b-instruct": 0.188,
"llama-3.3-70b-instruct": 0.532,
"mistral-large-instruct-2411": 0.684,
"gemma-2-27b-it": 0.30000000000000004,
"gemma-2-9b-it": 0.156,
"deepseek-v3": 0.788,
"deepseek-r1": 0.7479999999999999,
"qwq-32b": 0.8119999999999999,
"Average": 0.5008
},
"NeuroEnv": {
"qwen2.5-3b-instruct": 0.10800000000000001,
"qwen2.5-7b-instruct": 0.24400000000000005,
"qwen2.5-14b-instruct": 0.8960000000000001,
"qwen2.5-32b-instruct": 0.892,
"qwen2.5-72b-instruct": 0.8879999999999999,
"llama-3.1-8b-instruct": 0.512,
"llama-3.1-70b-instruct": 0.8880000000000001,
"llama-3.2-3b-instruct": 0.20400000000000001,
"llama-3.3-70b-instruct": 0.9279999999999999,
"mistral-large-instruct-2411": 0.8880000000000001,
"gemma-2-27b-it": 0.72,
"gemma-2-9b-it": 0.42800000000000005,
"deepseek-v3": 0.952,
"deepseek-r1": 0.932,
"qwq-32b": 0.852,
"Average": 0.6888000000000001
},
"OceanEnv": {
"qwen2.5-3b-instruct": 0.2,
"qwen2.5-7b-instruct": 0.45999999999999996,
"qwen2.5-14b-instruct": 0.6160000000000001,
"qwen2.5-32b-instruct": 0.6000000000000001,
"qwen2.5-72b-instruct": 0.62,
"llama-3.1-8b-instruct": 0.36400000000000005,
"llama-3.1-70b-instruct": 0.5680000000000001,
"llama-3.2-3b-instruct": 0.156,
"llama-3.3-70b-instruct": 0.476,
"mistral-large-instruct-2411": 0.656,
"gemma-2-27b-it": 0.43200000000000005,
"gemma-2-9b-it": 0.248,
"deepseek-v3": 0.852,
"deepseek-r1": 0.836,
"qwq-32b": 0.8240000000000001,
"Average": 0.5272000000000001
},
"MineralEnv": {
"qwen2.5-3b-instruct": 0.14400000000000002,
"qwen2.5-7b-instruct": 0.38,
"qwen2.5-14b-instruct": 0.768,
"qwen2.5-32b-instruct": 0.6960000000000001,
"qwen2.5-72b-instruct": 0.684,
"llama-3.1-8b-instruct": 0.29600000000000004,
"llama-3.1-70b-instruct": 0.556,
"llama-3.2-3b-instruct": 0.16,
"llama-3.3-70b-instruct": 0.56,
"mistral-large-instruct-2411": 0.66,
"gemma-2-27b-it": 0.384,
"gemma-2-9b-it": 0.17200000000000001,
"deepseek-v3": 0.8480000000000001,
"deepseek-r1": 0.82,
"qwq-32b": 0.8720000000000001,
"Average": 0.5333333333333333
},
"FishEnv": {
"qwen2.5-3b-instruct": 0.188,
"qwen2.5-7b-instruct": 0.38,
"qwen2.5-14b-instruct": 0.732,
"qwen2.5-32b-instruct": 0.668,
"qwen2.5-72b-instruct": 0.7200000000000001,
"llama-3.1-8b-instruct": 0.392,
"llama-3.1-70b-instruct": 0.624,
"llama-3.2-3b-instruct": 0.13599999999999998,
"llama-3.3-70b-instruct": 0.616,
"mistral-large-instruct-2411": 0.736,
"gemma-2-27b-it": 0.508,
"gemma-2-9b-it": 0.268,
"deepseek-v3": 0.86,
"deepseek-r1": 0.868,
"qwq-32b": 0.924,
"Average": 0.5746666666666667
},
"MartialArtsEnv": {
"qwen2.5-3b-instruct": 0.184,
"qwen2.5-7b-instruct": 0.43200000000000005,
"qwen2.5-14b-instruct": 0.672,
"qwen2.5-32b-instruct": 0.5640000000000001,
"qwen2.5-72b-instruct": 0.56,
"llama-3.1-8b-instruct": 0.276,
"llama-3.1-70b-instruct": 0.54,
"llama-3.2-3b-instruct": 0.2,
"llama-3.3-70b-instruct": 0.52,
"mistral-large-instruct-2411": 0.568,
"gemma-2-27b-it": 0.4,
"gemma-2-9b-it": 0.22400000000000003,
"deepseek-v3": 0.784,
"deepseek-r1": 0.716,
"qwq-32b": 0.752,
"Average": 0.4928
},
"RocketFuelEnv": {
"qwen2.5-3b-instruct": 0.22800000000000004,
"qwen2.5-7b-instruct": 0.41600000000000004,
"qwen2.5-14b-instruct": 0.852,
"qwen2.5-32b-instruct": 0.7879999999999999,
"qwen2.5-72b-instruct": 0.8160000000000001,
"llama-3.1-8b-instruct": 0.36,
"llama-3.1-70b-instruct": 0.6799999999999999,
"llama-3.2-3b-instruct": 0.184,
"llama-3.3-70b-instruct": 0.7239999999999999,
"mistral-large-instruct-2411": 0.828,
"gemma-2-27b-it": 0.6279999999999999,
"gemma-2-9b-it": 0.248,
"deepseek-v3": 0.916,
"deepseek-r1": 0.8960000000000001,
"qwq-32b": 0.9040000000000001,
"Average": 0.6312000000000001
},
"MLEnv": {
"qwen2.5-3b-instruct": 0.088,
"qwen2.5-7b-instruct": 0.392,
"qwen2.5-14b-instruct": 0.6,
"qwen2.5-32b-instruct": 0.748,
"qwen2.5-72b-instruct": 0.792,
"llama-3.1-8b-instruct": 0.304,
"llama-3.1-70b-instruct": 0.672,
"llama-3.2-3b-instruct": 0.10799999999999998,
"llama-3.3-70b-instruct": 0.5960000000000001,
"mistral-large-instruct-2411": 0.7639999999999999,
"gemma-2-27b-it": 0.264,
"gemma-2-9b-it": 0.156,
"deepseek-v3": 0.808,
"deepseek-r1": 0.652,
"qwq-32b": 0.772,
"Average": 0.5144
},
"PoliticalManifestoEnv": {
"qwen2.5-3b-instruct": 0.184,
"qwen2.5-7b-instruct": 0.312,
"qwen2.5-14b-instruct": 0.76,
"qwen2.5-32b-instruct": 0.852,
"qwen2.5-72b-instruct": 0.7839999999999999,
"llama-3.1-8b-instruct": 0.42400000000000004,
"llama-3.1-70b-instruct": 0.62,
"llama-3.2-3b-instruct": 0.128,
"llama-3.3-70b-instruct": 0.692,
"mistral-large-instruct-2411": 0.796,
"gemma-2-27b-it": 0.45200000000000007,
"gemma-2-9b-it": 0.152,
"deepseek-v3": 0.86,
"deepseek-r1": 0.792,
"qwq-32b": 0.8800000000000001,
"Average": 0.5792
},
"CoffeeEnv": {
"qwen2.5-3b-instruct": 0.20400000000000001,
"qwen2.5-7b-instruct": 0.38,
"qwen2.5-14b-instruct": 0.7799999999999999,
"qwen2.5-32b-instruct": 0.8039999999999999,
"qwen2.5-72b-instruct": 0.764,
"llama-3.1-8b-instruct": 0.31599999999999995,
"llama-3.1-70b-instruct": 0.552,
"llama-3.2-3b-instruct": 0.17200000000000001,
"llama-3.3-70b-instruct": 0.6599999999999999,
"mistral-large-instruct-2411": 0.828,
"gemma-2-27b-it": 0.592,
"gemma-2-9b-it": 0.364,
"deepseek-v3": 0.9120000000000001,
"deepseek-r1": 0.9279999999999999,
"qwq-32b": 0.9359999999999999,
"Average": 0.6128
},
"MotifAnalysisEnv": {
"qwen2.5-3b-instruct": 0.096,
"qwen2.5-7b-instruct": 0.332,
"qwen2.5-14b-instruct": 0.5680000000000001,
"qwen2.5-32b-instruct": 0.496,
"qwen2.5-72b-instruct": 0.5920000000000001,
"llama-3.1-8b-instruct": 0.244,
"llama-3.1-70b-instruct": 0.36000000000000004,
"llama-3.2-3b-instruct": 0.13999999999999999,
"llama-3.3-70b-instruct": 0.22400000000000003,
"mistral-large-instruct-2411": 0.46399999999999997,
"gemma-2-27b-it": 0.18,
"gemma-2-9b-it": 0.128,
"deepseek-v3": 0.752,
"deepseek-r1": 0.8240000000000001,
"qwq-32b": 0.8640000000000001,
"Average": 0.4176
},
"NutritionEnv": {
"qwen2.5-3b-instruct": 0.132,
"qwen2.5-7b-instruct": 0.22000000000000003,
"qwen2.5-14b-instruct": 0.7920000000000001,
"qwen2.5-32b-instruct": 0.8400000000000001,
"qwen2.5-72b-instruct": 0.876,
"llama-3.1-8b-instruct": 0.264,
"llama-3.1-70b-instruct": 0.64,
"llama-3.2-3b-instruct": 0.128,
"llama-3.3-70b-instruct": 0.7040000000000001,
"mistral-large-instruct-2411": 0.8320000000000001,
"gemma-2-27b-it": 0.38,
"gemma-2-9b-it": 0.20800000000000002,
"deepseek-v3": 0.944,
"deepseek-r1": 0.944,
"qwq-32b": 0.9120000000000001,
"Average": 0.5877333333333333
},
"MalwareEnv": {
"qwen2.5-3b-instruct": 0.16,
"qwen2.5-7b-instruct": 0.316,
"qwen2.5-14b-instruct": 0.728,
"qwen2.5-32b-instruct": 0.756,
"qwen2.5-72b-instruct": 0.7200000000000001,
"llama-3.1-8b-instruct": 0.268,
"llama-3.1-70b-instruct": 0.5840000000000001,
"llama-3.2-3b-instruct": 0.10800000000000001,
"llama-3.3-70b-instruct": 0.548,
"mistral-large-instruct-2411": 0.752,
"gemma-2-27b-it": 0.252,
"gemma-2-9b-it": 0.12,
"deepseek-v3": 0.916,
"deepseek-r1": 0.9,
"qwq-32b": 0.916,
"Average": 0.5362666666666667
},
"GeologicalEnv": {
"qwen2.5-3b-instruct": 0.132,
"qwen2.5-7b-instruct": 0.336,
"qwen2.5-14b-instruct": 0.7639999999999999,
"qwen2.5-32b-instruct": 0.748,
"qwen2.5-72b-instruct": 0.676,
"llama-3.1-8b-instruct": 0.28800000000000003,
"llama-3.1-70b-instruct": 0.552,
"llama-3.2-3b-instruct": 0.13999999999999999,
"llama-3.3-70b-instruct": 0.508,
"mistral-large-instruct-2411": 0.812,
"gemma-2-27b-it": 0.41600000000000004,
"gemma-2-9b-it": 0.164,
"deepseek-v3": 0.9119999999999999,
"deepseek-r1": 0.8480000000000001,
"qwq-32b": 0.8880000000000001,
"Average": 0.5456000000000001
},
"TheatricalEnv": {
"qwen2.5-3b-instruct": 0.14400000000000002,
"qwen2.5-7b-instruct": 0.42400000000000004,
"qwen2.5-14b-instruct": 0.676,
"qwen2.5-32b-instruct": 0.78,
"qwen2.5-72b-instruct": 0.808,
"llama-3.1-8b-instruct": 0.41200000000000003,
"llama-3.1-70b-instruct": 0.7959999999999999,
"llama-3.2-3b-instruct": 0.1,
"llama-3.3-70b-instruct": 0.768,
"mistral-large-instruct-2411": 0.844,
"gemma-2-27b-it": 0.528,
"gemma-2-9b-it": 0.28,
"deepseek-v3": 0.884,
"deepseek-r1": 0.8240000000000001,
"qwq-32b": 0.908,
"Average": 0.6117333333333335
},
"PrintingTechniqueEnv": {
"qwen2.5-3b-instruct": 0.144,
"qwen2.5-7b-instruct": 0.252,
"qwen2.5-14b-instruct": 0.736,
"qwen2.5-32b-instruct": 0.7200000000000001,
"qwen2.5-72b-instruct": 0.776,
"llama-3.1-8b-instruct": 0.4,
"llama-3.1-70b-instruct": 0.54,
"llama-3.2-3b-instruct": 0.16,
"llama-3.3-70b-instruct": 0.548,
"mistral-large-instruct-2411": 0.7040000000000001,
"gemma-2-27b-it": 0.44000000000000006,
"gemma-2-9b-it": 0.192,
"deepseek-v3": 0.916,
"deepseek-r1": 0.852,
"qwq-32b": 0.9279999999999999,
"Average": 0.5538666666666666
},
"StellarEnv": {
"qwen2.5-3b-instruct": 0.132,
"qwen2.5-7b-instruct": 0.388,
"qwen2.5-14b-instruct": 0.6759999999999999,
"qwen2.5-32b-instruct": 0.724,
"qwen2.5-72b-instruct": 0.6960000000000001,
"llama-3.1-8b-instruct": 0.30000000000000004,
"llama-3.1-70b-instruct": 0.6040000000000001,
"llama-3.2-3b-instruct": 0.16,
"llama-3.3-70b-instruct": 0.6240000000000001,
"mistral-large-instruct-2411": 0.732,
"gemma-2-27b-it": 0.364,
"gemma-2-9b-it": 0.23199999999999998,
"deepseek-v3": 0.82,
"deepseek-r1": 0.648,
"qwq-32b": 0.776,
"Average": 0.5250666666666667
},
"SoilEnv": {
"qwen2.5-3b-instruct": 0.172,
"qwen2.5-7b-instruct": 0.48,
"qwen2.5-14b-instruct": 0.8320000000000001,
"qwen2.5-32b-instruct": 0.788,
"qwen2.5-72b-instruct": 0.8240000000000001,
"llama-3.1-8b-instruct": 0.42400000000000004,
"llama-3.1-70b-instruct": 0.64,
"llama-3.2-3b-instruct": 0.22799999999999998,
"llama-3.3-70b-instruct": 0.664,
"mistral-large-instruct-2411": 0.76,
"gemma-2-27b-it": 0.628,
"gemma-2-9b-it": 0.44000000000000006,
"deepseek-v3": 0.884,
"deepseek-r1": 0.8039999999999999,
"qwq-32b": 0.8480000000000001,
"Average": 0.6277333333333334
},
"SoftwareEnv": {
"qwen2.5-3b-instruct": 0.14800000000000002,
"qwen2.5-7b-instruct": 0.40800000000000003,
"qwen2.5-14b-instruct": 0.744,
"qwen2.5-32b-instruct": 0.86,
"qwen2.5-72b-instruct": 0.8400000000000001,
"llama-3.1-8b-instruct": 0.4159999999999999,
"llama-3.1-70b-instruct": 0.72,
"llama-3.2-3b-instruct": 0.16799999999999998,
"llama-3.3-70b-instruct": 0.784,
"mistral-large-instruct-2411": 0.804,
"gemma-2-27b-it": 0.528,
"gemma-2-9b-it": 0.308,
"deepseek-v3": 0.836,
"deepseek-r1": 0.8360000000000001,
"qwq-32b": 0.8800000000000001,
"Average": 0.6186666666666667
},
"CarIdentificationEnv": {
"qwen2.5-3b-instruct": 0.272,
"qwen2.5-7b-instruct": 0.4,
"qwen2.5-14b-instruct": 0.9120000000000001,
"qwen2.5-32b-instruct": 0.916,
"qwen2.5-72b-instruct": 0.9359999999999999,
"llama-3.1-8b-instruct": 0.544,
"llama-3.1-70b-instruct": 0.8400000000000001,
"llama-3.2-3b-instruct": 0.124,
"llama-3.3-70b-instruct": 0.852,
"mistral-large-instruct-2411": 0.9119999999999999,
"gemma-2-27b-it": 0.672,
"gemma-2-9b-it": 0.376,
"deepseek-v3": 0.992,
"deepseek-r1": 0.952,
"qwq-32b": 0.9879999999999999,
"Average": 0.7125333333333334
},
"PharmaceuticalEnv": {
"qwen2.5-3b-instruct": 0.156,
"qwen2.5-7b-instruct": 0.32,
"qwen2.5-14b-instruct": 0.7600000000000001,
"qwen2.5-32b-instruct": 0.752,
"qwen2.5-72b-instruct": 0.7559999999999999,
"llama-3.1-8b-instruct": 0.28400000000000003,
"llama-3.1-70b-instruct": 0.508,
"llama-3.2-3b-instruct": 0.148,
"llama-3.3-70b-instruct": 0.472,
"mistral-large-instruct-2411": 0.756,
"gemma-2-27b-it": 0.336,
"gemma-2-9b-it": 0.128,
"deepseek-v3": 0.8800000000000001,
"deepseek-r1": 0.8640000000000001,
"qwq-32b": 0.8,
"Average": 0.528
},
"NetworkEnv": {
"qwen2.5-3b-instruct": 0.184,
"qwen2.5-7b-instruct": 0.36,
"qwen2.5-14b-instruct": 0.66,
"qwen2.5-32b-instruct": 0.716,
"qwen2.5-72b-instruct": 0.716,
"llama-3.1-8b-instruct": 0.43199999999999994,
"llama-3.1-70b-instruct": 0.68,
"llama-3.2-3b-instruct": 0.14400000000000002,
"llama-3.3-70b-instruct": 0.7040000000000001,
"mistral-large-instruct-2411": 0.78,
"gemma-2-27b-it": 0.492,
"gemma-2-9b-it": 0.392,
"deepseek-v3": 0.8400000000000001,
"deepseek-r1": 0.736,
"qwq-32b": 0.828,
"Average": 0.5776
},
"BirdNestEnv": {
"qwen2.5-3b-instruct": 0.148,
"qwen2.5-7b-instruct": 0.21200000000000002,
"qwen2.5-14b-instruct": 0.48,
"qwen2.5-32b-instruct": 0.33999999999999997,
"qwen2.5-72b-instruct": 0.42400000000000004,
"llama-3.1-8b-instruct": 0.16799999999999998,
"llama-3.1-70b-instruct": 0.22400000000000003,
"llama-3.2-3b-instruct": 0.084,
"llama-3.3-70b-instruct": 0.20800000000000002,
"mistral-large-instruct-2411": 0.492,
"gemma-2-27b-it": 0.176,
"gemma-2-9b-it": 0.128,
"deepseek-v3": 0.764,
"deepseek-r1": 0.756,
"qwq-32b": 0.8119999999999999,
"Average": 0.36106666666666676
},
"EnergyEnv": {
"qwen2.5-3b-instruct": 0.15999999999999998,
"qwen2.5-7b-instruct": 0.42000000000000004,
"qwen2.5-14b-instruct": 0.7999999999999999,
"qwen2.5-32b-instruct": 0.7,
"qwen2.5-72b-instruct": 0.5880000000000001,
"llama-3.1-8b-instruct": 0.29600000000000004,
"llama-3.1-70b-instruct": 0.46799999999999997,
"llama-3.2-3b-instruct": 0.18,
"llama-3.3-70b-instruct": 0.396,
"mistral-large-instruct-2411": 0.78,
"gemma-2-27b-it": 0.35200000000000004,
"gemma-2-9b-it": 0.196,
"deepseek-v3": 0.916,
"deepseek-r1": 0.8720000000000001,
"qwq-32b": 0.8880000000000001,
"Average": 0.5341333333333333
},
"LanguageEnv": {
"qwen2.5-3b-instruct": 0.196,
"qwen2.5-7b-instruct": 0.304,
"qwen2.5-14b-instruct": 0.388,
"qwen2.5-32b-instruct": 0.512,
"qwen2.5-72b-instruct": 0.5599999999999999,
"llama-3.1-8b-instruct": 0.23200000000000004,
"llama-3.1-70b-instruct": 0.40800000000000003,
"llama-3.2-3b-instruct": 0.144,
"llama-3.3-70b-instruct": 0.336,
"mistral-large-instruct-2411": 0.536,
"gemma-2-27b-it": 0.20800000000000002,
"gemma-2-9b-it": 0.172,
"deepseek-v3": 0.724,
"deepseek-r1": 0.716,
"qwq-32b": 0.8119999999999999,
"Average": 0.41653333333333337
},
"AlgorithmEnv": {
"qwen2.5-3b-instruct": 0.1,
"qwen2.5-7b-instruct": 0.28400000000000003,
"qwen2.5-14b-instruct": 0.688,
"qwen2.5-32b-instruct": 0.6960000000000001,
"qwen2.5-72b-instruct": 0.66,
"llama-3.1-8b-instruct": 0.35200000000000004,
"llama-3.1-70b-instruct": 0.512,
"llama-3.2-3b-instruct": 0.22399999999999998,
"llama-3.3-70b-instruct": 0.484,
"mistral-large-instruct-2411": 0.788,
"gemma-2-27b-it": 0.268,
"gemma-2-9b-it": 0.164,
"deepseek-v3": 0.792,
"deepseek-r1": 0.724,
"qwq-32b": 0.812,
"Average": 0.5032
},
"MathematicalEnv": {
"qwen2.5-3b-instruct": 0.048,
"qwen2.5-7b-instruct": 0.42800000000000005,
"qwen2.5-14b-instruct": 0.7000000000000001,
"qwen2.5-32b-instruct": 0.8119999999999999,
"qwen2.5-72b-instruct": 0.792,
"llama-3.1-8b-instruct": 0.316,
"llama-3.1-70b-instruct": 0.8,
"llama-3.2-3b-instruct": 0.12800000000000003,
"llama-3.3-70b-instruct": 0.8400000000000001,
"mistral-large-instruct-2411": 0.884,
"gemma-2-27b-it": 0.268,
"gemma-2-9b-it": 0.068,
"deepseek-v3": 0.9119999999999999,
"deepseek-r1": 0.876,
"qwq-32b": 0.8160000000000001,
"Average": 0.5792
},
"MusicalEnv": {
"qwen2.5-3b-instruct": 0.04,
"qwen2.5-7b-instruct": 0.336,
"qwen2.5-14b-instruct": 0.8039999999999999,
"qwen2.5-32b-instruct": 0.8560000000000001,
"qwen2.5-72b-instruct": 0.8400000000000001,
"llama-3.1-8b-instruct": 0.34400000000000003,
"llama-3.1-70b-instruct": 0.68,
"llama-3.2-3b-instruct": 0.088,
"llama-3.3-70b-instruct": 0.8240000000000001,
"mistral-large-instruct-2411": 0.884,
"gemma-2-27b-it": 0.28,
"gemma-2-9b-it": 0.11599999999999999,
"deepseek-v3": 0.9480000000000001,
"deepseek-r1": 0.892,
"qwq-32b": 0.9039999999999999,
"Average": 0.5890666666666668
},
"InventorEnv": {
"qwen2.5-3b-instruct": 0.14800000000000002,
"qwen2.5-7b-instruct": 0.43200000000000005,
"qwen2.5-14b-instruct": 0.776,
"qwen2.5-32b-instruct": 0.7999999999999999,
"qwen2.5-72b-instruct": 0.772,
"llama-3.1-8b-instruct": 0.4,
"llama-3.1-70b-instruct": 0.7,
"llama-3.2-3b-instruct": 0.188,
"llama-3.3-70b-instruct": 0.616,
"mistral-large-instruct-2411": 0.8039999999999999,
"gemma-2-27b-it": 0.552,
"gemma-2-9b-it": 0.364,
"deepseek-v3": 0.9399999999999998,
"deepseek-r1": 0.908,
"qwq-32b": 0.9,
"Average": 0.62
},
"MedicalEnv": {
"qwen2.5-3b-instruct": 0.22000000000000003,
"qwen2.5-7b-instruct": 0.544,
"qwen2.5-14b-instruct": 0.8320000000000001,
"qwen2.5-32b-instruct": 0.8800000000000001,
"qwen2.5-72b-instruct": 0.8960000000000001,
"llama-3.1-8b-instruct": 0.52,
"llama-3.1-70b-instruct": 0.82,
"llama-3.2-3b-instruct": 0.23200000000000004,
"llama-3.3-70b-instruct": 0.8960000000000001,
"mistral-large-instruct-2411": 0.8960000000000001,
"gemma-2-27b-it": 0.692,
"gemma-2-9b-it": 0.5760000000000001,
"deepseek-v3": 0.9039999999999999,
"deepseek-r1": 0.9359999999999999,
"qwq-32b": 0.9199999999999999,
"Average": 0.7175999999999999
},
"MusicEnv": {
"qwen2.5-3b-instruct": 0.184,
"qwen2.5-7b-instruct": 0.26,
"qwen2.5-14b-instruct": 0.656,
"qwen2.5-32b-instruct": 0.664,
"qwen2.5-72b-instruct": 0.7559999999999999,
"llama-3.1-8b-instruct": 0.356,
"llama-3.1-70b-instruct": 0.596,
"llama-3.2-3b-instruct": 0.10800000000000001,
"llama-3.3-70b-instruct": 0.596,
"mistral-large-instruct-2411": 0.6639999999999999,
"gemma-2-27b-it": 0.45600000000000007,
"gemma-2-9b-it": 0.28400000000000003,
"deepseek-v3": 0.8119999999999999,
"deepseek-r1": 0.868,
"qwq-32b": 0.868,
"Average": 0.5418666666666667
},
"FantasyEnv": {
"qwen2.5-3b-instruct": 0.148,
"qwen2.5-7b-instruct": 0.32,
"qwen2.5-14b-instruct": 0.74,
"qwen2.5-32b-instruct": 0.7879999999999999,
"qwen2.5-72b-instruct": 0.5720000000000001,
"llama-3.1-8b-instruct": 0.40800000000000003,
"llama-3.1-70b-instruct": 0.676,
"llama-3.2-3b-instruct": 0.152,
"llama-3.3-70b-instruct": 0.704,
"mistral-large-instruct-2411": 0.8240000000000001,
"gemma-2-27b-it": 0.524,
"gemma-2-9b-it": 0.324,
"deepseek-v3": 0.9199999999999999,
"deepseek-r1": 0.9719999999999999,
"qwq-32b": 0.9719999999999999,
"Average": 0.6029333333333332
},
"EducationEnv": {
"qwen2.5-3b-instruct": 0.10400000000000001,
"qwen2.5-7b-instruct": 0.268,
"qwen2.5-14b-instruct": 0.828,
"qwen2.5-32b-instruct": 0.9039999999999999,
"qwen2.5-72b-instruct": 0.8480000000000001,
"llama-3.1-8b-instruct": 0.5680000000000001,
"llama-3.1-70b-instruct": 0.768,
"llama-3.2-3b-instruct": 0.192,
"llama-3.3-70b-instruct": 0.9039999999999999,
"mistral-large-instruct-2411": 0.876,
"gemma-2-27b-it": 0.624,
"gemma-2-9b-it": 0.45999999999999996,
"deepseek-v3": 0.9480000000000001,
"deepseek-r1": 0.9,
"qwq-32b": 0.9359999999999999,
"Average": 0.6752
},
"ChemicalEnv": {
"qwen2.5-3b-instruct": 0.264,
"qwen2.5-7b-instruct": 0.44000000000000006,
"qwen2.5-14b-instruct": 0.724,
"qwen2.5-32b-instruct": 0.7040000000000001,
"qwen2.5-72b-instruct": 0.72,
"llama-3.1-8b-instruct": 0.36,
"llama-3.1-70b-instruct": 0.62,
"llama-3.2-3b-instruct": 0.16399999999999998,
"llama-3.3-70b-instruct": 0.45999999999999996,
"mistral-large-instruct-2411": 0.68,
"gemma-2-27b-it": 0.44399999999999995,
"gemma-2-9b-it": 0.316,
"deepseek-v3": 0.8799999999999999,
"deepseek-r1": 0.6799999999999999,
"qwq-32b": 0.8200000000000001,
"Average": 0.5517333333333333
},
"Average": {
"qwen2.5-3b-instruct": 0.1655841584158416,
"qwen2.5-7b-instruct": 0.34736633663366323,
"qwen2.5-14b-instruct": 0.7148514851485149,
"qwen2.5-32b-instruct": 0.7330693069306928,
"qwen2.5-72b-instruct": 0.7272079207920793,
"llama-3.1-8b-instruct": 0.3334653465346535,
"llama-3.1-70b-instruct": 0.6271287128712871,
"llama-3.2-3b-instruct": 0.15599999999999997,
"llama-3.3-70b-instruct": 0.6372277227722771,
"mistral-large-instruct-2411": 0.7573861386138615,
"gemma-2-27b-it": 0.44522772277227735,
"gemma-2-9b-it": 0.264,
"deepseek-v3": 0.8605148514851484,
"deepseek-r1": 0.8304554455445546,
"qwq-32b": 0.8630891089108911
}
}