scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.4222222222222222,0.10831349206349207 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9999999999999999,5.511463844797178e-07 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.9555555555555554,5.5114638447971785e-06 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9555555555555554,5.5114638447971785e-06 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.9438798074485389,0.0001621317520439264 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.9888264649460883,7.772240795323086e-05 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6751492859105923,0.008201584218040297 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.828078671210825,0.001082228864258374 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8539864924534399,0.0006436975254696865 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9555555555555554,5.5114638447971785e-06 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7501937734175208,0.0029250200956793346 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.9555555555555554,5.5114638447971785e-06 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.35957325998039574,0.15076277502664528 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6592611948214576,0.008926875535053643 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.3333333333333333,0.21637345679012346 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5393598899705937,0.03114121059579671 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.19999999999999998,0.4843127204585538 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.06666666666666667,0.8618005952380953 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4222222222222222,0.10831349206349207 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.19999999999999998,0.4843127204585538 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.24444444444444444,0.38071979717813054 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9555555555555554,5.5114638447971785e-06 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.3680349649825889,0.14634982666257293 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.40451991747794525,0.1059975484249457 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3864634590332683,0.12531568180831723 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.24444444444444444,0.38071979717813054 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5393598899705937,0.03114121059579671 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7191465199607915,0.004057136032371292 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.3333333333333333,0.21637345679012346 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4222222222222222,0.10831349206349207 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5555555555555555,0.02860945767195767 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.3333333333333333,0.21637345679012346 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.37777777777777777,0.1557418430335097 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.37777777777777777,0.1557418430335097 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.3333333333333333,0.21637345679012346 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.24444444444444444,0.38071979717813054 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.28888888888888886,0.2912483465608466 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.28888888888888886,0.2912483465608466 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.19999999999999998,0.4843127204585538 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.28888888888888886,0.2912483465608466 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.28888888888888886,0.2912483465608466 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.28888888888888886,0.2912483465608466 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.3333333333333333,0.21637345679012346 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.28888888888888886,0.2912483465608466 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.37777777777777777,0.1557418430335097 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.37777777777777777,0.1557418430335097 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.37777777777777777,0.1557418430335097 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.4222222222222222,0.10831349206349207 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4666666666666666,0.07255015432098766 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4666666666666666,0.07255015432098766 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4666666666666666,0.07255015432098766 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5111111111111111,0.04662257495590829 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,0,0.9555555555555554,5.5114638447971785e-06 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,1,0.9555555555555554,5.5114638447971785e-06 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,2,0.9555555555555554,5.5114638447971785e-06 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,5,0.9555555555555554,5.5114638447971785e-06 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9555555555555554,5.5114638447971785e-06 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,9,0.9555555555555554,5.5114638447971785e-06 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,0,0.9999999999999999,5.511463844797178e-07 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,1,0.9999999999999999,5.511463844797178e-07 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,2,0.9999999999999999,5.511463844797178e-07 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,3,0.9999999999999999,5.511463844797178e-07 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9999999999999999,5.511463844797178e-07 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,5,0.9999999999999999,5.511463844797178e-07 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,6,0.9999999999999999,5.511463844797178e-07 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9999999999999999,5.511463844797178e-07 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9999999999999999,5.511463844797178e-07 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,9,0.9999999999999999,5.511463844797178e-07 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.9555555555555554,5.5114638447971785e-06 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7640931774583409,0.002263469812035174 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8090398349558905,0.0012254240706707103 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6292532049656926,0.011921001496914019 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7820743005880014,0.002024186938238222 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9200874124564722,0.00028192590783336483 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6741998624632421,0.0070583320485280866 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6292532049656926,0.011921001496914019 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7191465199607915,0.004057136032371292 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6741998624632421,0.0070583320485280866 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5520524474738833,0.029345108174841844 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9999999999999999,5.511463844797178e-07 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.9555555555555554,5.5114638447971785e-06 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9999999999999999,5.511463844797178e-07 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7640931774583409,0.002263469812035174 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6292532049656926,0.011921001496914019 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.4222222222222222,0.10831349206349207 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7640931774583409,0.002263469812035174 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.37777777777777777,0.1557418430335097 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7191465199607915,0.004057136032371292 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6751492859105923,0.008201584218040297 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5843065474681431,0.019550269092885535 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7191465199607915,0.004057136032371292 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6741998624632421,0.0070583320485280866 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8539864924534399,0.0006436975254696865 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7191465199607915,0.004057136032371292 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8090398349558905,0.0012254240706707103 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5111111111111111,0.04662257495590829 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.49441323247304414,0.048193488293190756 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7640931774583409,0.002263469812035174 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7191465199607915,0.004057136032371292 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5843065474681431,0.019550269092885535 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6292532049656926,0.011921001496914019 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8090398349558905,0.0012254240706707103 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.37777777777777777,0.1557418430335097 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5111111111111111,0.04662257495590829 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4222222222222222,0.10831349206349207 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.37777777777777777,0.1557418430335097 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.28888888888888886,0.2912483465608466 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.28888888888888886,0.2912483465608466 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.28888888888888886,0.2912483465608466 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5111111111111111,0.04662257495590829 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.28888888888888886,0.2912483465608466 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4222222222222222,0.10831349206349207 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.3333333333333333,0.21637345679012346 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5555555555555555,0.02860945767195767 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5111111111111111,0.04662257495590829 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.3333333333333333,0.21637345679012346 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4222222222222222,0.10831349206349207 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.4666666666666666,0.07255015432098766 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.3333333333333333,0.21637345679012346 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4222222222222222,0.10831349206349207 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.3333333333333333,0.21637345679012346 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.3333333333333333,0.21637345679012346 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.3333333333333333,0.21637345679012346 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4222222222222222,0.10831349206349207 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5843065474681431,0.019550269092885535 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7640931774583409,0.002263469812035174 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6741998624632421,0.0070583320485280866 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.9555555555555554,5.5114638447971785e-06 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.28888888888888886,0.2912483465608466 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.9555555555555554,5.5114638447971785e-06 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.9888264649460883,7.772240795323086e-05 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.9438798074485389,0.0001621317520439264 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.9555555555555554,5.5114638447971785e-06 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9438798074485389,0.0001621317520439264 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9999999999999999,5.511463844797178e-07 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8539864924534399,0.0006436975254696865 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6741998624632421,0.0070583320485280866 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.9999999999999999,5.511463844797178e-07 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9438798074485389,0.0001621317520439264 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.9555555555555554,5.5114638447971785e-06 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8989331499509894,0.0003280163150135276 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.9555555555555554,5.5114638447971785e-06 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8989331499509894,0.0003280163150135276 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8539864924534399,0.0006436975254696865 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.37777777777777777,0.1557418430335097 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.5555555555555555,0.02860945767195767 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.5111111111111111,0.04662257495590829 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.5555555555555555,0.02860945767195767 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.4666666666666666,0.07255015432098766 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.3333333333333333,0.21637345679012346 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.3333333333333333,0.21637345679012346 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.28888888888888886,0.2912483465608466 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.37777777777777777,0.1557418430335097 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.022222222222222223,1.0 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.37777777777777777,0.1557418430335097 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.5111111111111111,0.04662257495590829 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.28888888888888886,0.2912483465608466 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.24444444444444444,0.38071979717813054 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.4666666666666666,0.07255015432098766 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.37777777777777777,0.1557418430335097 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.3333333333333333,0.21637345679012346 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.37777777777777777,0.1557418430335097 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.40451991747794525,0.1059975484249457 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5843065474681431,0.019550269092885535 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5843065474681431,0.019550269092885535 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.27602622373694163,0.27597331801373814 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6292532049656926,0.011921001496914019 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4666666666666666,0.07255015432098766 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.3220305943597653,0.20373653988950713 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6292532049656926,0.011921001496914019 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7191465199607915,0.004057136032371292 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.28888888888888886,0.2912483465608466 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4666666666666666,0.07255015432098766 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.24444444444444444,0.38071979717813054 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5843065474681431,0.019550269092885535 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.28888888888888886,0.2912483465608466 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.3333333333333333,0.21637345679012346 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.4222222222222222,0.10831349206349207 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.35957325998039574,0.15076277502664528 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4494665749754947,0.07248608508684644 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.4666666666666666,0.07255015432098766 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6292532049656926,0.011921001496914019 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5111111111111111,0.04662257495590829 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6741998624632421,0.0070583320485280866 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4666666666666666,0.07255015432098766 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4666666666666666,0.07255015432098766 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.37777777777777777,0.1557418430335097 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4666666666666666,0.07255015432098766 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.40451991747794525,0.1059975484249457 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.19999999999999998,0.4843127204585538 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.37777777777777777,0.1557418430335097 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.3146266024828463,0.2086677876982641 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.24444444444444444,0.38071979717813054 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4666666666666666,0.07255015432098766 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4666666666666666,0.07255015432098766 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.28888888888888886,0.2912483465608466 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.37777777777777777,0.1557418430335097 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.28888888888888886,0.2912483465608466 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.37777777777777777,0.1557418430335097 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5111111111111111,0.04662257495590829 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.3333333333333333,0.21637345679012346 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8539864924534399,0.0006436975254696865 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8989331499509894,0.0003280163150135276 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8090398349558905,0.0012254240706707103 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8539864924534399,0.0006436975254696865 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8090398349558905,0.0012254240706707103 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8090398349558905,0.0012254240706707103 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8539864924534399,0.0006436975254696865 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8539864924534399,0.0006436975254696865 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8090398349558905,0.0012254240706707103 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.9555555555555554,5.5114638447971785e-06 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.8539864924534399,0.0006436975254696865 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.7191465199607915,0.004057136032371292 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.6137949055234262,0.01491204965521593 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.828078671210825,0.001116926714212636 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.8740830418336486,0.0005809444435783661 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.5843065474681431,0.019550269092885535 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.5393598899705937,0.03114121059579671 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.8865926413116155,0.000437257592706733 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.4666666666666666,0.07255015432098766 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.5111111111111111,0.04662257495590829 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.9999999999999999,5.511463844797178e-07 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.9555555555555554,5.5114638447971785e-06 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.4222222222222222,0.10831349206349207 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.4222222222222222,0.10831349206349207 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.5111111111111111,0.04662257495590829 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.4222222222222222,0.10831349206349207 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.6,0.016666115520282188 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.9438798074485389,0.0001621317520439264 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.9888264649460883,7.772240795323086e-05 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.6751492859105923,0.008201584218040297 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.828078671210825,0.001082228864258374 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.8539864924534399,0.0006436975254696865 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.7501937734175208,0.0029250200956793346 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.35957325998039574,0.15076277502664528 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.6592611948214576,0.008926875535053643 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.3333333333333333,0.21637345679012346 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.5393598899705937,0.03114121059579671 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.19999999999999998,0.4843127204585538 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.06666666666666667,0.8618005952380953 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.4222222222222222,0.10831349206349207 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.19999999999999998,0.4843127204585538 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.6,0.016666115520282188 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.24444444444444444,0.38071979717813054 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.3680349649825889,0.14634982666257293 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.40451991747794525,0.1059975484249457 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.3864634590332683,0.12531568180831723 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.24444444444444444,0.38071979717813054 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.5393598899705937,0.03114121059579671 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.7191465199607915,0.004057136032371292 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.3333333333333333,0.21637345679012346 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.4222222222222222,0.10831349206349207 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.5555555555555555,0.02860945767195767 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.3333333333333333,0.21637345679012346 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.37777777777777777,0.1557418430335097 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.37777777777777777,0.1557418430335097 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.6,0.016666115520282188 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,0,0.3333333333333333,0.21637345679012346 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,1,0.24444444444444444,0.38071979717813054 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,2,0.28888888888888886,0.2912483465608466 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,3,0.28888888888888886,0.2912483465608466 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,4,0.19999999999999998,0.4843127204585538 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,5,0.28888888888888886,0.2912483465608466 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,6,0.28888888888888886,0.2912483465608466 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,7,0.28888888888888886,0.2912483465608466 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,8,0.3333333333333333,0.21637345679012346 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,9,0.28888888888888886,0.2912483465608466 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,4,0.37777777777777777,0.1557418430335097 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,5,0.37777777777777777,0.1557418430335097 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,9,0.37777777777777777,0.1557418430335097 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,2,0.4222222222222222,0.10831349206349207 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,4,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,5,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,9,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,5,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,8,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,0,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,1,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,2,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,5,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,8,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,9,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,0,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,1,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,2,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,3,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,4,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,5,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,6,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,7,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,8,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,9,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,1,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,2,0.7640931774583409,0.002263469812035174 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,2,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,9,0.6292532049656926,0.011921001496914019 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,3,0.7820743005880014,0.002024186938238222 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,4,0.9200874124564722,0.00028192590783336483 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,5,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,6,0.6292532049656926,0.011921001496914019 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,7,0.7191465199607915,0.004057136032371292 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,8,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,9,0.5520524474738833,0.029345108174841844 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,4,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,5,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,8,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,0,0.7640931774583409,0.002263469812035174 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,9,0.6292532049656926,0.011921001496914019 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,2,0.4222222222222222,0.10831349206349207 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,7,0.7640931774583409,0.002263469812035174 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,9,0.37777777777777777,0.1557418430335097 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,1,0.7191465199607915,0.004057136032371292 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,2,0.6751492859105923,0.008201584218040297 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,3,0.5843065474681431,0.019550269092885535 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,5,0.7191465199607915,0.004057136032371292 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,7,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,8,0.8539864924534399,0.0006436975254696865 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,9,0.7191465199607915,0.004057136032371292 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,2,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,8,0.5111111111111111,0.04662257495590829 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,9,0.49441323247304414,0.048193488293190756 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,0,0.7640931774583409,0.002263469812035174 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,1,0.7191465199607915,0.004057136032371292 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,2,0.5843065474681431,0.019550269092885535 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,9,0.6292532049656926,0.011921001496914019 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,0,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,2,0.37777777777777777,0.1557418430335097 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,5,0.5111111111111111,0.04662257495590829 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,0,0.4222222222222222,0.10831349206349207 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,1,0.37777777777777777,0.1557418430335097 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,2,0.28888888888888886,0.2912483465608466 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,3,0.28888888888888886,0.2912483465608466 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,4,0.28888888888888886,0.2912483465608466 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,6,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,7,0.28888888888888886,0.2912483465608466 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,8,0.4222222222222222,0.10831349206349207 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,9,0.3333333333333333,0.21637345679012346 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,2,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,5,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,0,0.3333333333333333,0.21637345679012346 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,1,0.4222222222222222,0.10831349206349207 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,2,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,3,0.3333333333333333,0.21637345679012346 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,5,0.4222222222222222,0.10831349206349207 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,6,0.3333333333333333,0.21637345679012346 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,7,0.3333333333333333,0.21637345679012346 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,8,0.3333333333333333,0.21637345679012346 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,9,0.4222222222222222,0.10831349206349207 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,7,0.6,0.016666115520282188 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,0,0.5843065474681431,0.019550269092885535 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,1,0.7640931774583409,0.002263469812035174 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,2,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,8,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,2,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,2,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,7,0.28888888888888886,0.2912483465608466 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,2,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,6,0.6,0.016666115520282188 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,0,0.9888264649460883,7.772240795323086e-05 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,1,0.9438798074485389,0.0001621317520439264 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,2,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,7,0.9438798074485389,0.0001621317520439264 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,8,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,0,0.8539864924534399,0.0006436975254696865 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,1,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,2,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,7,0.9438798074485389,0.0001621317520439264 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,2,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,0,0.8989331499509894,0.0003280163150135276 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,2,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,3,0.8989331499509894,0.0003280163150135276 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,8,0.8539864924534399,0.0006436975254696865 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.37777777777777777,0.1557418430335097 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.5555555555555555,0.02860945767195767 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.5111111111111111,0.04662257495590829 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.5555555555555555,0.02860945767195767 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.4666666666666666,0.07255015432098766 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.3333333333333333,0.21637345679012346 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.3333333333333333,0.21637345679012346 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.28888888888888886,0.2912483465608466 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.37777777777777777,0.1557418430335097 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.022222222222222223,1.0 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.37777777777777777,0.1557418430335097 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.5111111111111111,0.04662257495590829 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.28888888888888886,0.2912483465608466 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.6,0.016666115520282188 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.24444444444444444,0.38071979717813054 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,2,0.37777777777777777,0.1557418430335097 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,9,0.3333333333333333,0.21637345679012346 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,2,0.37777777777777777,0.1557418430335097 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,7,0.6,0.016666115520282188 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,9,0.40451991747794525,0.1059975484249457 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,0,0.5843065474681431,0.019550269092885535 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,1,0.5843065474681431,0.019550269092885535 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,2,0.27602622373694163,0.27597331801373814 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,3,0.6292532049656926,0.011921001496914019 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,5,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,6,0.3220305943597653,0.20373653988950713 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,7,0.6292532049656926,0.011921001496914019 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,8,0.7191465199607915,0.004057136032371292 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,9,0.28888888888888886,0.2912483465608466 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,1,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,2,0.24444444444444444,0.38071979717813054 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,5,0.5843065474681431,0.019550269092885535 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,6,0.28888888888888886,0.2912483465608466 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,9,0.3333333333333333,0.21637345679012346 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,2,0.4222222222222222,0.10831349206349207 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,9,0.35957325998039574,0.15076277502664528 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,1,0.4494665749754947,0.07248608508684644 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,2,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,4,0.6292532049656926,0.011921001496914019 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,6,0.5111111111111111,0.04662257495590829 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,7,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,9,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,1,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,2,0.37777777777777777,0.1557418430335097 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,3,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,5,0.40451991747794525,0.1059975484249457 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,6,0.19999999999999998,0.4843127204585538 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,7,0.37777777777777777,0.1557418430335097 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,9,0.3146266024828463,0.2086677876982641 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,2,0.24444444444444444,0.38071979717813054 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,3,0.6,0.016666115520282188 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,4,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,5,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,6,0.28888888888888886,0.2912483465608466 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,7,0.37777777777777777,0.1557418430335097 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,8,0.6,0.016666115520282188 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,9,0.28888888888888886,0.2912483465608466 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,2,0.37777777777777777,0.1557418430335097 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,5,0.5111111111111111,0.04662257495590829 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,9,0.3333333333333333,0.21637345679012346 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,0,0.8539864924534399,0.0006436975254696865 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,1,0.8989331499509894,0.0003280163150135276 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,2,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,3,0.8539864924534399,0.0006436975254696865 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,4,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,5,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,7,0.8539864924534399,0.0006436975254696865 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,8,0.8539864924534399,0.0006436975254696865 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,9,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,1,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,3,0.6,0.016666115520282188 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,1,0.8539864924534399,0.0006436975254696865 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,2,0.7191465199607915,0.004057136032371292 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,3,0.6137949055234262,0.01491204965521593 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,5,0.828078671210825,0.001116926714212636 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,6,0.8740830418336486,0.0005809444435783661 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,7,0.5843065474681431,0.019550269092885535 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,8,0.5393598899705937,0.03114121059579671 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,9,0.8865926413116155,0.000437257592706733 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,3,0.4666666666666666,0.07255015432098766 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,8,0.5111111111111111,0.04662257495590829 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,8,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,0,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,1,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,3,0.4222222222222222,0.10831349206349207 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,5,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,7,0.4222222222222222,0.10831349206349207 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,8,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,5,0.5111111111111111,0.04662257495590829 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552