scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,10,0,-0.06666666666666667,0.8618005952380953 Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,10,1,0.19999999999999998,0.4843127204585538 Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5555555555555555,0.02860945767195767 Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,10,3,0.24444444444444444,0.38071979717813054 Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,10,4,0.24444444444444444,0.38071979717813054 Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4666666666666666,0.07255015432098766 Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,10,6,0.19999999999999998,0.4843127204585538 Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,10,7,0.24444444444444444,0.38071979717813054 Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,10,8,0.24444444444444444,0.38071979717813054 Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,10,9,0.022222222222222223,1.0 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8090398349558905,0.0012254240706707103 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.24444444444444444,0.38071979717813054 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.06666666666666667,0.8618005952380953 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,-0.15555555555555553,0.6006536596119929 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.1111111111111111,0.7274895282186948 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,-0.022222222222222223,1.0 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.3146266024828463,0.2086677876982641 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5111111111111111,0.04662257495590829 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.35957325998039574,0.15076277502664528 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.19999999999999998,0.4843127204585538 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.022222222222222223,1.0 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4222222222222222,0.10831349206349207 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.37777777777777777,0.1557418430335097 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.022222222222222223,1.0 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4222222222222222,0.10831349206349207 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.24444444444444444,0.38071979717813054 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6741998624632421,0.0070583320485280866 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.37777777777777777,0.1557418430335097 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.26967994498529685,0.2811980995641792 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.37777777777777777,0.1557418430335097 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.37777777777777777,0.1557418430335097 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5555555555555555,0.02860945767195767 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5843065474681431,0.019550269092885535 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8090398349558905,0.0012254240706707103 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5111111111111111,0.04662257495590829 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4666666666666666,0.07255015432098766 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4494665749754947,0.07248608508684644 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5393598899705937,0.03114121059579671 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.3333333333333333,0.21637345679012346 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.28888888888888886,0.2912483465608466 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5555555555555555,0.02860945767195767 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4222222222222222,0.10831349206349207 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.3333333333333333,0.21637345679012346 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4666666666666666,0.07255015432098766 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4222222222222222,0.10831349206349207 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5393598899705937,0.03114121059579671 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7191465199607915,0.004057136032371292 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6292532049656926,0.011921001496914019 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4666666666666666,0.07255015432098766 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5555555555555555,0.02860945767195767 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.49441323247304414,0.048193488293190756 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9999999999999999,5.511463844797178e-07 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.9555555555555554,5.5114638447971785e-06 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.9999999999999999,5.511463844797178e-07 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9888264649460883,7.772240795323086e-05 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7640931774583409,0.002263469812035174 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7956600627155523,0.0016003458153453507 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8090398349558905,0.0012254240706707103 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.841126352013584,0.0008493886834182052 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7191465199607915,0.004057136032371292 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8539864924534399,0.0006436975254696865 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9320589306096472,0.00021829510003985143 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6741998624632421,0.0070583320485280866 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8090398349558905,0.0012254240706707103 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,-0.06666666666666667,0.8618005952380953 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.24444444444444444,0.38071979717813054 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.1111111111111111,0.7274895282186948 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5843065474681431,0.019550269092885535 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5555555555555555,0.02860945767195767 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4222222222222222,0.10831349206349207 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.13483997249264842,0.590013887163346 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.08989331499509894,0.7194375444233914 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.26967994498529685,0.2811980995641792 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6741998624632421,0.0070583320485280866 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.06666666666666667,0.8618005952380953 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.022222222222222223,1.0 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,-0.022222222222222223,1.0 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4222222222222222,0.10831349206349207 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.022222222222222223,1.0 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,-0.1111111111111111,0.7274895282186948 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4666666666666666,0.07255015432098766 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5111111111111111,0.04662257495590829 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4494665749754947,0.07248608508684644 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6292532049656926,0.011921001496914019 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8090398349558905,0.0012254240706707103 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7191465199607915,0.004057136032371292 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7191465199607915,0.004057136032371292 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.3146266024828463,0.2086677876982641 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7191465199607915,0.004057136032371292 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6137949055234262,0.01491204965521593 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7956600627155523,0.0016003458153453507 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5520524474738833,0.029345108174841844 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7501937734175208,0.0029250200956793346 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6292532049656926,0.011921001496914019 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8539864924534399,0.0006436975254696865 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5555555555555555,0.02860945767195767 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4666666666666666,0.07255015432098766 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.3333333333333333,0.21637345679012346 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4222222222222222,0.10831349206349207 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6741998624632421,0.0070583320485280866 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7820743005880014,0.002024186938238222 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.37777777777777777,0.1557418430335097 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5555555555555555,0.02860945767195767 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6741998624632421,0.0070583320485280866 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7640931774583409,0.002263469812035174 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7191465199607915,0.004057136032371292 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.19999999999999998,0.4843127204585538 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5393598899705937,0.03114121059579671 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5111111111111111,0.04662257495590829 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 helm_hellaswag,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 helm_hellaswag,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836 helm_hellaswag,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 helm_hellaswag,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 helm_hellaswag,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 helm_hellaswag,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 helm_hellaswag,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 helm_hellaswag,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9999999999999999,5.511463844797178e-07 helm_hellaswag,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8539864924534399,0.0006436975254696865 helm_hellaswag,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4666666666666666,0.07255015432098766 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.2500645911391736,0.3212772414140661 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.26967994498529685,0.2811980995641792 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5111111111111111,0.04662257495590829 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4494665749754947,0.07248608508684644 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.15555555555555553,0.6006536596119929 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.29553088043720516,0.24112859961644273 helm_truthfulqa,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,-0.06666666666666667,0.8618005952380953 helm_truthfulqa,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.19999999999999998,0.4843127204585538 helm_truthfulqa,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.06666666666666667,0.8618005952380953 helm_truthfulqa,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.3333333333333333,0.21637345679012346 helm_truthfulqa,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.31462660248284624,0.2086677876982641 helm_truthfulqa,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346 helm_truthfulqa,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4222222222222222,0.10831349206349207 helm_truthfulqa,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,-0.13483997249264842,0.590013887163346 helm_truthfulqa,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.3595732599803958,0.15076277502664528 helm_truthfulqa,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 Helm MSMARCO Regular,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829 Helm MSMARCO Regular,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 Helm MSMARCO Regular,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 Helm MSMARCO Regular,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 Helm MSMARCO Regular,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 Helm MSMARCO Regular,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 Helm MSMARCO Regular,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 Helm MSMARCO Regular,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 Helm MSMARCO Regular,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 Helm MSMARCO Regular,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.9999999999999999,5.511463844797178e-07 Helm MSMARCO Trec,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767 Helm MSMARCO Trec,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188 Helm MSMARCO Trec,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.3333333333333333,0.21637345679012346 Helm MSMARCO Trec,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 Helm MSMARCO Trec,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 Helm MSMARCO Trec,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,-0.06666666666666667,0.8618005952380953 Helm MSMARCO Trec,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 Helm MSMARCO Trec,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 Helm MSMARCO Trec,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 Helm MSMARCO Trec,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.3333333333333333,0.21637345679012346 helm_cnn/dailymail,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6741998624632421,0.0070583320485280866 helm_cnn/dailymail,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8989331499509895,0.0003280163150135276 helm_cnn/dailymail,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 helm_cnn/dailymail,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8090398349558905,0.0012254240706707103 helm_cnn/dailymail,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8090398349558905,0.0012254240706707103 helm_cnn/dailymail,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5111111111111111,0.04662257495590829 helm_cnn/dailymail,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7360699299651778,0.0036714498212758067 helm_cnn/dailymail,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 helm_cnn/dailymail,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7191465199607916,0.004057136032371292 helm_cnn/dailymail,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7047274841194893,0.005187148855929351 Helm XSUM,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4494665749754947,0.07248608508684644 Helm XSUM,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.15555555555555553,0.6006536596119929 Helm XSUM,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.3333333333333333,0.21637345679012346 Helm XSUM,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.3333333333333333,0.21637345679012346 Helm XSUM,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.49441323247304414,0.048193488293190756 Helm XSUM,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5843065474681431,0.019550269092885535 Helm XSUM,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 Helm XSUM,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.28888888888888886,0.2912483465608466 Helm XSUM,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.15555555555555553,0.6006536596119929 Helm XSUM,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4666666666666666,0.07255015432098766 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.15555555555555553,0.6006536596119929 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4222222222222222,0.10831349206349207 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.3333333333333333,0.21637345679012346 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,-0.022222222222222223,1.0 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6137949055234262,0.01491204965521593 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4222222222222222,0.10831349206349207 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.15555555555555553,0.6006536596119929 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.26967994498529685,0.2811980995641792 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8989331499509894,0.0003280163150135276 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.26967994498529685,0.2811980995641792 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6741998624632421,0.0070583320485280866 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.40451991747794525,0.1059975484249457 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5843065474681431,0.019550269092885535 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.37777777777777777,0.1557418430335097 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4222222222222222,0.10831349206349207 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,2,0.4666666666666666,0.07255015432098766 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6292532049656926,0.011921001496914019 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6741998624632421,0.0070583320485280866 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7191465199607915,0.004057136032371292 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4494665749754947,0.07248608508684644 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4222222222222222,0.10831349206349207 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5555555555555555,0.02860945767195767 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6292532049656926,0.011921001496914019 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5555555555555555,0.02860945767195767 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.4666666666666666,0.07255015432098766 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4222222222222222,0.10831349206349207 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.24444444444444444,0.38071979717813054 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5111111111111111,0.04662257495590829 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.19999999999999998,0.4843127204585538 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.3333333333333333,0.21637345679012346 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5555555555555555,0.02860945767195767 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5111111111111111,0.04662257495590829 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6292532049656926,0.011921001496914019 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5393598899705937,0.03114121059579671 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.37777777777777777,0.1557418430335097 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.37777777777777777,0.1557418430335097 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.28888888888888886,0.2912483465608466 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.3333333333333333,0.21637345679012346 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7640931774583409,0.002263469812035174 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.49441323247304414,0.048193488293190756 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7191465199607915,0.004057136032371292 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.37777777777777777,0.1557418430335097 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6292532049656926,0.011921001496914019 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4222222222222222,0.10831349206349207 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5843065474681431,0.019550269092885535 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7191465199607915,0.004057136032371292 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.2500645911391736,0.3212772414140661 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.522862326927363,0.03809415806109578 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4666666666666666,0.07255015432098766 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.24444444444444444,0.38071979717813054 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.28888888888888886,0.2912483465608466 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.24444444444444444,0.38071979717813054 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5393598899705937,0.03114121059579671 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.24444444444444444,0.38071979717813054 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5555555555555555,0.02860945767195767 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.49441323247304414,0.048193488293190756 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6741998624632421,0.0070583320485280866 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6741998624632421,0.0070583320485280866 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4222222222222222,0.10831349206349207 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.3333333333333333,0.21637345679012346 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5111111111111111,0.04662257495590829 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.40451991747794525,0.1059975484249457 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4222222222222222,0.10831349206349207 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5111111111111111,0.04662257495590829 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5843065474681431,0.019550269092885535 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4494665749754947,0.07248608508684644 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.15555555555555553,0.6006536596119929 toolbench_tuned_on_toolbench,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,, toolbench_tuned_on_toolbench,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,, toolbench_tuned_on_toolbench,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,, toolbench_tuned_on_toolbench,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,, toolbench_tuned_on_toolbench,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,, toolbench_tuned_on_toolbench,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,, toolbench_tuned_on_toolbench,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,, toolbench_tuned_on_toolbench,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,, toolbench_tuned_on_toolbench,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,, toolbench_tuned_on_toolbench,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,, toolbench_avg.,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 toolbench_avg.,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 toolbench_avg.,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 toolbench_avg.,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 toolbench_avg.,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767 toolbench_avg.,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 toolbench_avg.,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 toolbench_avg.,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 toolbench_avg.,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 toolbench_avg.,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 toolbench_open_weather,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6292532049656925,0.011921001496914019 toolbench_open_weather,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6292532049656925,0.011921001496914019 toolbench_open_weather,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.764093177458341,0.002263469812035174 toolbench_open_weather,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5393598899705937,0.03114121059579671 toolbench_open_weather,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5393598899705937,0.03114121059579671 toolbench_open_weather,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7191465199607916,0.004057136032371292 toolbench_open_weather,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6741998624632421,0.0070583320485280866 toolbench_open_weather,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6741998624632421,0.0070583320485280866 toolbench_open_weather,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6741998624632421,0.0070583320485280866 toolbench_open_weather,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6741998624632421,0.0070583320485280866 toolbench_the_cat_api,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 toolbench_the_cat_api,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 toolbench_the_cat_api,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 toolbench_the_cat_api,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 toolbench_the_cat_api,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829 toolbench_the_cat_api,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 toolbench_the_cat_api,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 toolbench_the_cat_api,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 toolbench_the_cat_api,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 toolbench_the_cat_api,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 toolbench_home_search,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 toolbench_home_search,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 toolbench_home_search,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 toolbench_home_search,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767 toolbench_home_search,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767 toolbench_home_search,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 toolbench_home_search,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 toolbench_home_search,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 toolbench_home_search,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 toolbench_home_search,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 toolbench_trip_booking,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7191465199607916,0.004057136032371292 toolbench_trip_booking,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 toolbench_trip_booking,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6741998624632421,0.0070583320485280866 toolbench_trip_booking,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 toolbench_trip_booking,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6292532049656925,0.011921001496914019 toolbench_trip_booking,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 toolbench_trip_booking,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 toolbench_trip_booking,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 toolbench_trip_booking,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7191465199607916,0.004057136032371292 toolbench_trip_booking,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6741998624632421,0.0070583320485280866 toolbench_google_sheets,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 toolbench_google_sheets,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6292532049656925,0.011921001496914019 toolbench_google_sheets,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 toolbench_google_sheets,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5393598899705937,0.03114121059579671 toolbench_google_sheets,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.37777777777777777,0.1557418430335097 toolbench_google_sheets,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 toolbench_google_sheets,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6741998624632421,0.0070583320485280866 toolbench_google_sheets,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 toolbench_google_sheets,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 toolbench_google_sheets,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 toolbench_virtualhome,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5843065474681431,0.019550269092885535 toolbench_virtualhome,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 toolbench_virtualhome,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6741998624632421,0.0070583320485280866 toolbench_virtualhome,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4944132324730442,0.048193488293190756 toolbench_virtualhome,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.3333333333333333,0.21637345679012346 toolbench_virtualhome,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6292532049656925,0.011921001496914019 toolbench_virtualhome,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 toolbench_virtualhome,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5843065474681431,0.019550269092885535 toolbench_virtualhome,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 toolbench_virtualhome,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6292532049656925,0.011921001496914019 toolbench_webshop_long,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.2530855341217655,0.3648210653487137 toolbench_webshop_long,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.32539568672798425,0.2439639465303466 toolbench_webshop_long,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.2530855341217655,0.3648210653487137 toolbench_webshop_long,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.32539568672798425,0.2439639465303466 toolbench_webshop_long,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4700159919404217,0.09238008162314501 toolbench_webshop_long,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.32539568672798425,0.2439639465303466 toolbench_webshop_long,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.24845199749997662,0.384088249473852 toolbench_webshop_long,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.1807753815155468,0.517439239336394 toolbench_webshop_long,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.39770583933420295,0.15443051915129236 toolbench_webshop_long,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.32539568672798425,0.2439639465303466 toolbench_webshop_short,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.37796447300922725,0.14954135458461507 toolbench_webshop_short,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.37796447300922725,0.14954135458461507 toolbench_webshop_short,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.3810317377662721,0.15446576442708193 toolbench_webshop_short,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5967623950328607,0.020456721550759976 toolbench_webshop_short,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.629940788348712,0.016309171877754967 toolbench_webshop_short,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6445033866354896,0.012304364739182175 toolbench_webshop_short,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4354648431614539,0.10366227301829671 toolbench_webshop_short,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.35805743701971643,0.16431086269141681 toolbench_webshop_short,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4354648431614539,0.10366227301829671 toolbench_webshop_short,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6445033866354896,0.012304364739182175 toolbench_tabletop,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7191465199607916,0.004057136032371292 toolbench_tabletop,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.764093177458341,0.002263469812035174 toolbench_tabletop,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.764093177458341,0.002263469812035174 toolbench_tabletop,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6741998624632421,0.0070583320485280866 toolbench_tabletop,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5393598899705937,0.03114121059579671 toolbench_tabletop,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.764093177458341,0.002263469812035174 toolbench_tabletop,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6900655593423543,0.006458954266892998 toolbench_tabletop,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 toolbench_tabletop,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6292532049656925,0.011921001496914019 toolbench_tabletop,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.764093177458341,0.002263469812035174 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,1,0.9999999999999999,5.511463844797178e-07 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,3,0.9999999999999999,5.511463844797178e-07 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9999999999999999,5.511463844797178e-07 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,9,0.9555555555555554,5.5114638447971785e-06 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9999999999999999,5.511463844797178e-07 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9999999999999999,5.511463844797178e-07 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.35957325998039574,0.15076277502664528 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4666666666666666,0.07255015432098766 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.37777777777777777,0.1557418430335097 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4666666666666666,0.07255015432098766 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6741998624632421,0.0070583320485280866 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5111111111111111,0.04662257495590829 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4222222222222222,0.10831349206349207 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.37777777777777777,0.1557418430335097 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.40451991747794525,0.1059975484249457 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7640931774583409,0.002263469812035174 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5843065474681431,0.019550269092885535 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5843065474681431,0.019550269092885535 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5555555555555555,0.02860945767195767 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6292532049656926,0.011921001496914019 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.24444444444444444,0.38071979717813054 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7640931774583409,0.002263469812035174 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6292532049656926,0.011921001496914019 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7640931774583409,0.002263469812035174 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9438798074485389,0.0001621317520439264 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5555555555555555,0.02860945767195767 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.37777777777777777,0.1557418430335097 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5393598899705937,0.03114121059579671 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9555555555555554,5.5114638447971785e-06 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4666666666666666,0.07255015432098766 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4222222222222222,0.10831349206349207 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.37777777777777777,0.1557418430335097 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.28888888888888886,0.2912483465608466 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.4222222222222222,0.10831349206349207 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.24444444444444444,0.38071979717813054 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.3333333333333333,0.21637345679012346 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.37777777777777777,0.1557418430335097 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.3333333333333333,0.21637345679012346 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.37777777777777777,0.1557418430335097 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.3333333333333333,0.21637345679012346 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,0,0.49441323247304414,0.048193488293190756 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6292532049656926,0.011921001496914019 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,2,0.3333333333333333,0.21637345679012346 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,3,0.19999999999999998,0.4843127204585538 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5843065474681431,0.019550269092885535 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6292532049656926,0.011921001496914019 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,7,0.28888888888888886,0.2912483465608466 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5393598899705937,0.03114121059579671 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5555555555555555,0.02860945767195767 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.841126352013584,0.0008493886834182052 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.9555555555555554,5.5114638447971785e-06 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.28888888888888886,0.2912483465608466 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4222222222222222,0.10831349206349207 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.9555555555555554,5.5114638447971785e-06 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9438798074485389,0.0001621317520439264 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8090398349558905,0.0012254240706707103 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.37777777777777777,0.1557418430335097 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.49441323247304414,0.048193488293190756 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5111111111111111,0.04662257495590829 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.37777777777777777,0.1557418430335097 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4222222222222222,0.10831349206349207 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4222222222222222,0.10831349206349207 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6741998624632421,0.0070583320485280866 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6741998624632421,0.0070583320485280866 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8090398349558905,0.0012254240706707103 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6741998624632421,0.0070583320485280866 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5555555555555555,0.02860945767195767 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 AgentBench,agenbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 AgentBench,agenbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 AgentBench,agenbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552 AgentBench,agenbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 AgentBench,agenbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 AgentBench,agenbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 AgentBench,agenbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 AgentBench,agenbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 AgentBench,agenbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 AgentBench,agenbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 MT-Bench,mtbench_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,-0.1111111111111111,0.7274895282186948 MT-Bench,mtbench_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,-0.1111111111111111,0.7274895282186948 MT-Bench,mtbench_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,-0.1111111111111111,0.7274895282186948 MT-Bench,mtbench_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,-0.1111111111111111,0.7274895282186948 MT-Bench,mtbench_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,-0.1111111111111111,0.7274895282186948 MT-Bench,mtbench_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,-0.1111111111111111,0.7274895282186948 MT-Bench,mtbench_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,-0.1111111111111111,0.7274895282186948 MT-Bench,mtbench_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,-0.1111111111111111,0.7274895282186948 MT-Bench,mtbench_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,-0.1111111111111111,0.7274895282186948 MT-Bench,mtbench_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,-0.1111111111111111,0.7274895282186948 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.37777777777777777,0.1557418430335097 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.3333333333333333,0.21637345679012346 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.5555555555555555,0.02860945767195767 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.37777777777777777,0.1557418430335097 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.6292532049656926,0.011921001496914019 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.24444444444444444,0.38071979717813054 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.5555555555555555,0.02860945767195767 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.4222222222222222,0.10831349206349207 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.5111111111111111,0.04662257495590829 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.28888888888888886,0.2912483465608466 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.37777777777777777,0.1557418430335097 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.37777777777777777,0.1557418430335097 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.4666666666666666,0.07255015432098766 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.15555555555555553,0.6006536596119929 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.5111111111111111,0.04662257495590829 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.28888888888888886,0.2912483465608466 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.37777777777777777,0.1557418430335097 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.4494665749754947,0.07248608508684644 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.5555555555555555,0.02860945767195767 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.4222222222222222,0.10831349206349207 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.06666666666666667,0.8618005952380953 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.3333333333333333,0.21637345679012346 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.15555555555555553,0.6006536596119929 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4222222222222222,0.10831349206349207 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.3333333333333333,0.21637345679012346 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.28888888888888886,0.2912483465608466 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.19999999999999998,0.4843127204585538 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.20459830184114206,0.4170770595205646 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.3333333333333333,0.21637345679012346 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4666666666666666,0.07255015432098766 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.17978662999019787,0.47249761400068846 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4222222222222222,0.10831349206349207 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4494665749754947,0.07248608508684644 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.28888888888888886,0.2912483465608466 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.19999999999999998,0.4843127204585538 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.1111111111111111,0.7274895282186948 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.49441323247304414,0.048193488293190756 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.28888888888888886,0.2912483465608466 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4494665749754947,0.07248608508684644 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.19999999999999998,0.4843127204585538 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3146266024828463,0.2086677876982641 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.24444444444444444,0.38071979717813054 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.24444444444444444,0.38071979717813054 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.22473328748774735,0.36917141633269157 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6292532049656926,0.011921001496914019 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.08989331499509894,0.7194375444233914 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6292532049656926,0.011921001496914019 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.3333333333333333,0.21637345679012346 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5393598899705937,0.03114121059579671 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.17978662999019787,0.47249761400068846 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4666666666666666,0.07255015432098766 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.3333333333333333,0.21637345679012346 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4222222222222222,0.10831349206349207 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.13483997249264842,0.590013887163346 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7047274841194893,0.005187148855929351 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.24444444444444444,0.38071979717813054 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5843065474681431,0.019550269092885535 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.4222222222222222,0.10831349206349207 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.49441323247304414,0.048193488293190756 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.19999999999999998,0.4843127204585538 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.3333333333333333,0.21637345679012346 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.28888888888888886,0.2912483465608466 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.28888888888888886,0.2912483465608466 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.37777777777777777,0.1557418430335097 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.4222222222222222,0.10831349206349207 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4222222222222222,0.10831349206349207 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.28888888888888886,0.2912483465608466 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4666666666666666,0.07255015432098766 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4222222222222222,0.10831349206349207 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.22473328748774735,0.36917141633269157 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.04494665749754947,0.8574624419592412 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4222222222222222,0.10831349206349207 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.1111111111111111,0.7274895282186948 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.37777777777777777,0.1557418430335097 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.1111111111111111,0.7274895282186948 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.24444444444444444,0.38071979717813054 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.08989331499509894,0.7194375444233914 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.13483997249264842,0.590013887163346 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.1111111111111111,0.7274895282186948 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5555555555555555,0.02860945767195767 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.06666666666666667,0.8618005952380953 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.3333333333333333,0.21637345679012346 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.17978662999019787,0.47249761400068846 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4222222222222222,0.10831349206349207 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.1111111111111111,0.7274895282186948 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.24444444444444444,0.38071979717813054 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.24444444444444444,0.38071979717813054 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.1111111111111111,0.7274895282186948 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.22473328748774735,0.36917141633269157 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4666666666666666,0.07255015432098766 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.37777777777777777,0.1557418430335097 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4666666666666666,0.07255015432098766 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.15555555555555553,0.6006536596119929 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4222222222222222,0.10831349206349207 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4222222222222222,0.10831349206349207 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.3333333333333333,0.21637345679012346 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.19999999999999998,0.4843127204585538 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5393598899705937,0.03114121059579671 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.37777777777777777,0.1557418430335097 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.15555555555555553,0.6006536596119929 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4222222222222222,0.10831349206349207 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4666666666666666,0.07255015432098766 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5393598899705937,0.03114121059579671 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.3146266024828463,0.2086677876982641 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.08989331499509894,0.7194375444233914 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7640931774583409,0.002263469812035174 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.9555555555555554,5.5114638447971785e-06 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.7956600627155523,0.0016003458153453507 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.8539864924534399,0.0006436975254696865 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.8539864924534399,0.0006436975254696865 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.7640931774583409,0.002263469812035174 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.7191465199607915,0.004057136032371292 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.7191465199607915,0.004057136032371292 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.8539864924534399,0.0006436975254696865 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.6440611887195306,0.01102341638822145 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.6741998624632421,0.0070583320485280866 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.8989331499509894,0.0003280163150135276 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.8090398349558905,0.0012254240706707103 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.5843065474681431,0.019550269092885535 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.7047274841194893,0.005187148855929351 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.8539864924534399,0.0006436975254696865 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.5111111111111111,0.04662257495590829 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,10,0,-0.06666666666666667,0.8618005952380953 aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,10,1,0.19999999999999998,0.4843127204585538 aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,10,2,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,10,3,0.24444444444444444,0.38071979717813054 aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,10,4,0.24444444444444444,0.38071979717813054 aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,10,5,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,10,6,0.19999999999999998,0.4843127204585538 aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,10,7,0.24444444444444444,0.38071979717813054 aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,10,8,0.24444444444444444,0.38071979717813054 aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,10,9,0.022222222222222223,1.0 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,2,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,3,0.6,0.016666115520282188 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,0,0.24444444444444444,0.38071979717813054 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,2,0.06666666666666667,0.8618005952380953 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,3,-0.15555555555555553,0.6006536596119929 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,4,0.1111111111111111,0.7274895282186948 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,5,-0.022222222222222223,1.0 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,7,0.3146266024828463,0.2086677876982641 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,8,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,9,0.35957325998039574,0.15076277502664528 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,1,0.19999999999999998,0.4843127204585538 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,2,0.022222222222222223,1.0 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,3,0.4222222222222222,0.10831349206349207 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,4,0.37777777777777777,0.1557418430335097 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,5,0.022222222222222223,1.0 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,6,0.4222222222222222,0.10831349206349207 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,7,0.24444444444444444,0.38071979717813054 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,8,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,2,0.37777777777777777,0.1557418430335097 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,3,0.26967994498529685,0.2811980995641792 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,4,0.37777777777777777,0.1557418430335097 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,5,0.37777777777777777,0.1557418430335097 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,6,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,1,0.5843065474681431,0.019550269092885535 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,2,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,3,0.6,0.016666115520282188 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,5,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,7,0.6,0.016666115520282188 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,1,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,3,0.4494665749754947,0.07248608508684644 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,0,0.5393598899705937,0.03114121059579671 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,1,0.3333333333333333,0.21637345679012346 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,3,0.28888888888888886,0.2912483465608466 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,6,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,1,0.4222222222222222,0.10831349206349207 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,3,0.3333333333333333,0.21637345679012346 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,4,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,6,0.6,0.016666115520282188 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,7,0.4222222222222222,0.10831349206349207 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,1,0.5393598899705937,0.03114121059579671 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,2,0.7191465199607915,0.004057136032371292 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,3,0.6292532049656926,0.011921001496914019 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,4,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,5,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,8,0.6,0.016666115520282188 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,3,0.49441323247304414,0.048193488293190756 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,7,0.6,0.016666115520282188 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.9888264649460883,7.772240795323086e-05 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.7640931774583409,0.002263469812035174 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.7956600627155523,0.0016003458153453507 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.841126352013584,0.0008493886834182052 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.7191465199607915,0.004057136032371292 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.8539864924534399,0.0006436975254696865 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.9320589306096472,0.00021829510003985143 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,1,-0.06666666666666667,0.8618005952380953 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.6,0.016666115520282188 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.24444444444444444,0.38071979717813054 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.1111111111111111,0.7274895282186948 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.6,0.016666115520282188 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.5843065474681431,0.019550269092885535 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.5555555555555555,0.02860945767195767 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.4222222222222222,0.10831349206349207 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.13483997249264842,0.590013887163346 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.6,0.016666115520282188 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.08989331499509894,0.7194375444233914 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.6,0.016666115520282188 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.26967994498529685,0.2811980995641792 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.6,0.016666115520282188 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.06666666666666667,0.8618005952380953 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.022222222222222223,1.0 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,3,-0.022222222222222223,1.0 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.4222222222222222,0.10831349206349207 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.022222222222222223,1.0 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.6,0.016666115520282188 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,9,-0.1111111111111111,0.7274895282186948 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,1,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,3,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,5,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,7,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,0,0.4494665749754947,0.07248608508684644 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,1,0.6292532049656926,0.011921001496914019 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,2,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,3,0.7191465199607915,0.004057136032371292 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,5,0.7191465199607915,0.004057136032371292 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,7,0.3146266024828463,0.2086677876982641 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,8,0.7191465199607915,0.004057136032371292 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,9,0.6137949055234262,0.01491204965521593 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,2,0.7956600627155523,0.0016003458153453507 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,3,0.5520524474738833,0.029345108174841844 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,5,0.7501937734175208,0.0029250200956793346 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,6,0.6292532049656926,0.011921001496914019 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,9,0.8539864924534399,0.0006436975254696865 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,5,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,6,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,5,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,2,0.6,0.016666115520282188 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,4,0.3333333333333333,0.21637345679012346 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,5,0.4222222222222222,0.10831349206349207 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,7,0.6,0.016666115520282188 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,5,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,6,0.7820743005880014,0.002024186938238222 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,2,0.37777777777777777,0.1557418430335097 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,6,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,7,0.6,0.016666115520282188 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,0,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,5,0.7640931774583409,0.002263469812035174 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,6,0.7191465199607915,0.004057136032371292 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,1,0.6,0.016666115520282188 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,2,0.19999999999999998,0.4843127204585538 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,3,0.5393598899705937,0.03114121059579671 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,5,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,6,0.6,0.016666115520282188 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,helm_hellaswag,helm_classic_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,helm_hellaswag,helm_classic_240829.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836 aggregate,aggregate,helm_hellaswag,helm_classic_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,helm_hellaswag,helm_classic_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,helm_hellaswag,helm_classic_240829.csv,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,helm_hellaswag,helm_classic_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,helm_hellaswag,helm_classic_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,helm_hellaswag,helm_classic_240829.csv,kendall,random,10,7,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,helm_hellaswag,helm_classic_240829.csv,kendall,random,10,8,0.8539864924534399,0.0006436975254696865 aggregate,aggregate,helm_hellaswag,helm_classic_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,1,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,2,0.2500645911391736,0.3212772414140661 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,4,0.26967994498529685,0.2811980995641792 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,5,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,7,0.4494665749754947,0.07248608508684644 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,8,0.15555555555555553,0.6006536596119929 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,9,0.29553088043720516,0.24112859961644273 aggregate,aggregate,helm_truthfulqa,helm_classic_240829.csv,kendall,random,10,0,-0.06666666666666667,0.8618005952380953 aggregate,aggregate,helm_truthfulqa,helm_classic_240829.csv,kendall,random,10,1,0.19999999999999998,0.4843127204585538 aggregate,aggregate,helm_truthfulqa,helm_classic_240829.csv,kendall,random,10,2,0.06666666666666667,0.8618005952380953 aggregate,aggregate,helm_truthfulqa,helm_classic_240829.csv,kendall,random,10,3,0.3333333333333333,0.21637345679012346 aggregate,aggregate,helm_truthfulqa,helm_classic_240829.csv,kendall,random,10,4,0.31462660248284624,0.2086677876982641 aggregate,aggregate,helm_truthfulqa,helm_classic_240829.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346 aggregate,aggregate,helm_truthfulqa,helm_classic_240829.csv,kendall,random,10,6,0.4222222222222222,0.10831349206349207 aggregate,aggregate,helm_truthfulqa,helm_classic_240829.csv,kendall,random,10,7,-0.13483997249264842,0.590013887163346 aggregate,aggregate,helm_truthfulqa,helm_classic_240829.csv,kendall,random,10,8,0.3595732599803958,0.15076277502664528 aggregate,aggregate,helm_truthfulqa,helm_classic_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm MSMARCO Regular,helm_classic_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm MSMARCO Regular,helm_classic_240829.csv,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,Helm MSMARCO Regular,helm_classic_240829.csv,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,Helm MSMARCO Regular,helm_classic_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,Helm MSMARCO Regular,helm_classic_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm MSMARCO Regular,helm_classic_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm MSMARCO Regular,helm_classic_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,Helm MSMARCO Regular,helm_classic_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm MSMARCO Regular,helm_classic_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm MSMARCO Regular,helm_classic_240829.csv,kendall,random,10,9,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,Helm MSMARCO Trec,helm_classic_240829.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm MSMARCO Trec,helm_classic_240829.csv,kendall,random,10,1,0.6,0.016666115520282188 aggregate,aggregate,Helm MSMARCO Trec,helm_classic_240829.csv,kendall,random,10,2,0.3333333333333333,0.21637345679012346 aggregate,aggregate,Helm MSMARCO Trec,helm_classic_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm MSMARCO Trec,helm_classic_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm MSMARCO Trec,helm_classic_240829.csv,kendall,random,10,5,-0.06666666666666667,0.8618005952380953 aggregate,aggregate,Helm MSMARCO Trec,helm_classic_240829.csv,kendall,random,10,6,0.6,0.016666115520282188 aggregate,aggregate,Helm MSMARCO Trec,helm_classic_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm MSMARCO Trec,helm_classic_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm MSMARCO Trec,helm_classic_240829.csv,kendall,random,10,9,0.3333333333333333,0.21637345679012346 aggregate,aggregate,helm_cnn/dailymail,helm_classic_240829.csv,kendall,random,10,0,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,helm_cnn/dailymail,helm_classic_240829.csv,kendall,random,10,1,0.8989331499509895,0.0003280163150135276 aggregate,aggregate,helm_cnn/dailymail,helm_classic_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 aggregate,aggregate,helm_cnn/dailymail,helm_classic_240829.csv,kendall,random,10,3,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,helm_cnn/dailymail,helm_classic_240829.csv,kendall,random,10,4,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,helm_cnn/dailymail,helm_classic_240829.csv,kendall,random,10,5,0.5111111111111111,0.04662257495590829 aggregate,aggregate,helm_cnn/dailymail,helm_classic_240829.csv,kendall,random,10,6,0.7360699299651778,0.0036714498212758067 aggregate,aggregate,helm_cnn/dailymail,helm_classic_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,helm_cnn/dailymail,helm_classic_240829.csv,kendall,random,10,8,0.7191465199607916,0.004057136032371292 aggregate,aggregate,helm_cnn/dailymail,helm_classic_240829.csv,kendall,random,10,9,0.7047274841194893,0.005187148855929351 aggregate,aggregate,Helm XSUM,helm_classic_240829.csv,kendall,random,10,0,0.4494665749754947,0.07248608508684644 aggregate,aggregate,Helm XSUM,helm_classic_240829.csv,kendall,random,10,1,0.15555555555555553,0.6006536596119929 aggregate,aggregate,Helm XSUM,helm_classic_240829.csv,kendall,random,10,2,0.3333333333333333,0.21637345679012346 aggregate,aggregate,Helm XSUM,helm_classic_240829.csv,kendall,random,10,3,0.3333333333333333,0.21637345679012346 aggregate,aggregate,Helm XSUM,helm_classic_240829.csv,kendall,random,10,4,0.49441323247304414,0.048193488293190756 aggregate,aggregate,Helm XSUM,helm_classic_240829.csv,kendall,random,10,5,0.5843065474681431,0.019550269092885535 aggregate,aggregate,Helm XSUM,helm_classic_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm XSUM,helm_classic_240829.csv,kendall,random,10,7,0.28888888888888886,0.2912483465608466 aggregate,aggregate,Helm XSUM,helm_classic_240829.csv,kendall,random,10,8,0.15555555555555553,0.6006536596119929 aggregate,aggregate,Helm XSUM,helm_classic_240829.csv,kendall,random,10,9,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,0,0.15555555555555553,0.6006536596119929 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,1,0.4222222222222222,0.10831349206349207 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,2,0.3333333333333333,0.21637345679012346 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,5,-0.022222222222222223,1.0 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,6,0.6137949055234262,0.01491204965521593 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,7,0.4222222222222222,0.10831349206349207 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,8,0.15555555555555553,0.6006536596119929 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,9,0.26967994498529685,0.2811980995641792 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,1,0.8989331499509894,0.0003280163150135276 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,2,0.26967994498529685,0.2811980995641792 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,8,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,9,0.40451991747794525,0.1059975484249457 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,2,0.5843065474681431,0.019550269092885535 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,3,0.37777777777777777,0.1557418430335097 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,4,0.4222222222222222,0.10831349206349207 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,2,0.4666666666666666,0.07255015432098766 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,1,0.6292532049656926,0.011921001496914019 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,2,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,3,0.7191465199607915,0.004057136032371292 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,4,0.4494665749754947,0.07248608508684644 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,6,0.6,0.016666115520282188 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,7,0.4222222222222222,0.10831349206349207 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,9,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,1,0.6,0.016666115520282188 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,6,0.6292532049656926,0.011921001496914019 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,7,0.6,0.016666115520282188 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,9,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,1,0.6,0.016666115520282188 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,2,0.4666666666666666,0.07255015432098766 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,3,0.4222222222222222,0.10831349206349207 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,4,0.24444444444444444,0.38071979717813054 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,6,0.5111111111111111,0.04662257495590829 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,7,0.19999999999999998,0.4843127204585538 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,8,0.3333333333333333,0.21637345679012346 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,2,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,6,0.5111111111111111,0.04662257495590829 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,0,0.6292532049656926,0.011921001496914019 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,1,0.5393598899705937,0.03114121059579671 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,2,0.6,0.016666115520282188 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,3,0.37777777777777777,0.1557418430335097 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,4,0.37777777777777777,0.1557418430335097 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,6,0.6,0.016666115520282188 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,7,0.28888888888888886,0.2912483465608466 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,9,0.3333333333333333,0.21637345679012346 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,0,0.7640931774583409,0.002263469812035174 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,1,0.49441323247304414,0.048193488293190756 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,2,0.7191465199607915,0.004057136032371292 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,4,0.37777777777777777,0.1557418430335097 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,6,0.6292532049656926,0.011921001496914019 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,9,0.4222222222222222,0.10831349206349207 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,2,0.5843065474681431,0.019550269092885535 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,3,0.7191465199607915,0.004057136032371292 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,4,0.2500645911391736,0.3212772414140661 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,6,0.522862326927363,0.03809415806109578 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,9,0.4666666666666666,0.07255015432098766 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,0,0.24444444444444444,0.38071979717813054 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,1,0.28888888888888886,0.2912483465608466 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,3,0.6,0.016666115520282188 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,4,0.24444444444444444,0.38071979717813054 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,6,0.5393598899705937,0.03114121059579671 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,8,0.24444444444444444,0.38071979717813054 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,9,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,0,0.49441323247304414,0.048193488293190756 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,1,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,5,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,6,0.4222222222222222,0.10831349206349207 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,7,0.3333333333333333,0.21637345679012346 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,8,0.5111111111111111,0.04662257495590829 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,9,0.40451991747794525,0.1059975484249457 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,4,0.4222222222222222,0.10831349206349207 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,6,0.5111111111111111,0.04662257495590829 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,7,0.5843065474681431,0.019550269092885535 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,8,0.4494665749754947,0.07248608508684644 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,9,0.15555555555555553,0.6006536596119929 aggregate,aggregate,toolbench_tuned_on_toolbench,toolbench_240829.csv,kendall,random,10,0,, aggregate,aggregate,toolbench_tuned_on_toolbench,toolbench_240829.csv,kendall,random,10,1,, aggregate,aggregate,toolbench_tuned_on_toolbench,toolbench_240829.csv,kendall,random,10,2,, aggregate,aggregate,toolbench_tuned_on_toolbench,toolbench_240829.csv,kendall,random,10,3,, aggregate,aggregate,toolbench_tuned_on_toolbench,toolbench_240829.csv,kendall,random,10,4,, aggregate,aggregate,toolbench_tuned_on_toolbench,toolbench_240829.csv,kendall,random,10,5,, aggregate,aggregate,toolbench_tuned_on_toolbench,toolbench_240829.csv,kendall,random,10,6,, aggregate,aggregate,toolbench_tuned_on_toolbench,toolbench_240829.csv,kendall,random,10,7,, aggregate,aggregate,toolbench_tuned_on_toolbench,toolbench_240829.csv,kendall,random,10,8,, aggregate,aggregate,toolbench_tuned_on_toolbench,toolbench_240829.csv,kendall,random,10,9,, aggregate,aggregate,toolbench_avg.,toolbench_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,toolbench_avg.,toolbench_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,toolbench_avg.,toolbench_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,toolbench_avg.,toolbench_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,toolbench_avg.,toolbench_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767 aggregate,aggregate,toolbench_avg.,toolbench_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,toolbench_avg.,toolbench_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,toolbench_avg.,toolbench_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,toolbench_avg.,toolbench_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,toolbench_avg.,toolbench_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,toolbench_open_weather,toolbench_240829.csv,kendall,random,10,0,0.6292532049656925,0.011921001496914019 aggregate,aggregate,toolbench_open_weather,toolbench_240829.csv,kendall,random,10,1,0.6292532049656925,0.011921001496914019 aggregate,aggregate,toolbench_open_weather,toolbench_240829.csv,kendall,random,10,2,0.764093177458341,0.002263469812035174 aggregate,aggregate,toolbench_open_weather,toolbench_240829.csv,kendall,random,10,3,0.5393598899705937,0.03114121059579671 aggregate,aggregate,toolbench_open_weather,toolbench_240829.csv,kendall,random,10,4,0.5393598899705937,0.03114121059579671 aggregate,aggregate,toolbench_open_weather,toolbench_240829.csv,kendall,random,10,5,0.7191465199607916,0.004057136032371292 aggregate,aggregate,toolbench_open_weather,toolbench_240829.csv,kendall,random,10,6,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,toolbench_open_weather,toolbench_240829.csv,kendall,random,10,7,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,toolbench_open_weather,toolbench_240829.csv,kendall,random,10,8,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,toolbench_open_weather,toolbench_240829.csv,kendall,random,10,9,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,toolbench_the_cat_api,toolbench_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,toolbench_the_cat_api,toolbench_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,toolbench_the_cat_api,toolbench_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,toolbench_the_cat_api,toolbench_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,toolbench_the_cat_api,toolbench_240829.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829 aggregate,aggregate,toolbench_the_cat_api,toolbench_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,toolbench_the_cat_api,toolbench_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,toolbench_the_cat_api,toolbench_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,toolbench_the_cat_api,toolbench_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,toolbench_the_cat_api,toolbench_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,toolbench_home_search,toolbench_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 aggregate,aggregate,toolbench_home_search,toolbench_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,toolbench_home_search,toolbench_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,toolbench_home_search,toolbench_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767 aggregate,aggregate,toolbench_home_search,toolbench_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767 aggregate,aggregate,toolbench_home_search,toolbench_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,toolbench_home_search,toolbench_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,toolbench_home_search,toolbench_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,toolbench_home_search,toolbench_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,toolbench_home_search,toolbench_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 aggregate,aggregate,toolbench_trip_booking,toolbench_240829.csv,kendall,random,10,0,0.7191465199607916,0.004057136032371292 aggregate,aggregate,toolbench_trip_booking,toolbench_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,toolbench_trip_booking,toolbench_240829.csv,kendall,random,10,2,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,toolbench_trip_booking,toolbench_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 aggregate,aggregate,toolbench_trip_booking,toolbench_240829.csv,kendall,random,10,4,0.6292532049656925,0.011921001496914019 aggregate,aggregate,toolbench_trip_booking,toolbench_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,toolbench_trip_booking,toolbench_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,toolbench_trip_booking,toolbench_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,toolbench_trip_booking,toolbench_240829.csv,kendall,random,10,8,0.7191465199607916,0.004057136032371292 aggregate,aggregate,toolbench_trip_booking,toolbench_240829.csv,kendall,random,10,9,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,toolbench_google_sheets,toolbench_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,toolbench_google_sheets,toolbench_240829.csv,kendall,random,10,1,0.6292532049656925,0.011921001496914019 aggregate,aggregate,toolbench_google_sheets,toolbench_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,toolbench_google_sheets,toolbench_240829.csv,kendall,random,10,3,0.5393598899705937,0.03114121059579671 aggregate,aggregate,toolbench_google_sheets,toolbench_240829.csv,kendall,random,10,4,0.37777777777777777,0.1557418430335097 aggregate,aggregate,toolbench_google_sheets,toolbench_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,toolbench_google_sheets,toolbench_240829.csv,kendall,random,10,6,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,toolbench_google_sheets,toolbench_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 aggregate,aggregate,toolbench_google_sheets,toolbench_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,toolbench_google_sheets,toolbench_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,toolbench_virtualhome,toolbench_240829.csv,kendall,random,10,0,0.5843065474681431,0.019550269092885535 aggregate,aggregate,toolbench_virtualhome,toolbench_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 aggregate,aggregate,toolbench_virtualhome,toolbench_240829.csv,kendall,random,10,2,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,toolbench_virtualhome,toolbench_240829.csv,kendall,random,10,3,0.4944132324730442,0.048193488293190756 aggregate,aggregate,toolbench_virtualhome,toolbench_240829.csv,kendall,random,10,4,0.3333333333333333,0.21637345679012346 aggregate,aggregate,toolbench_virtualhome,toolbench_240829.csv,kendall,random,10,5,0.6292532049656925,0.011921001496914019 aggregate,aggregate,toolbench_virtualhome,toolbench_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,toolbench_virtualhome,toolbench_240829.csv,kendall,random,10,7,0.5843065474681431,0.019550269092885535 aggregate,aggregate,toolbench_virtualhome,toolbench_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 aggregate,aggregate,toolbench_virtualhome,toolbench_240829.csv,kendall,random,10,9,0.6292532049656925,0.011921001496914019 aggregate,aggregate,toolbench_webshop_long,toolbench_240829.csv,kendall,random,10,0,0.2530855341217655,0.3648210653487137 aggregate,aggregate,toolbench_webshop_long,toolbench_240829.csv,kendall,random,10,1,0.32539568672798425,0.2439639465303466 aggregate,aggregate,toolbench_webshop_long,toolbench_240829.csv,kendall,random,10,2,0.2530855341217655,0.3648210653487137 aggregate,aggregate,toolbench_webshop_long,toolbench_240829.csv,kendall,random,10,3,0.32539568672798425,0.2439639465303466 aggregate,aggregate,toolbench_webshop_long,toolbench_240829.csv,kendall,random,10,4,0.4700159919404217,0.09238008162314501 aggregate,aggregate,toolbench_webshop_long,toolbench_240829.csv,kendall,random,10,5,0.32539568672798425,0.2439639465303466 aggregate,aggregate,toolbench_webshop_long,toolbench_240829.csv,kendall,random,10,6,0.24845199749997662,0.384088249473852 aggregate,aggregate,toolbench_webshop_long,toolbench_240829.csv,kendall,random,10,7,0.1807753815155468,0.517439239336394 aggregate,aggregate,toolbench_webshop_long,toolbench_240829.csv,kendall,random,10,8,0.39770583933420295,0.15443051915129236 aggregate,aggregate,toolbench_webshop_long,toolbench_240829.csv,kendall,random,10,9,0.32539568672798425,0.2439639465303466 aggregate,aggregate,toolbench_webshop_short,toolbench_240829.csv,kendall,random,10,0,0.37796447300922725,0.14954135458461507 aggregate,aggregate,toolbench_webshop_short,toolbench_240829.csv,kendall,random,10,1,0.37796447300922725,0.14954135458461507 aggregate,aggregate,toolbench_webshop_short,toolbench_240829.csv,kendall,random,10,2,0.3810317377662721,0.15446576442708193 aggregate,aggregate,toolbench_webshop_short,toolbench_240829.csv,kendall,random,10,3,0.5967623950328607,0.020456721550759976 aggregate,aggregate,toolbench_webshop_short,toolbench_240829.csv,kendall,random,10,4,0.629940788348712,0.016309171877754967 aggregate,aggregate,toolbench_webshop_short,toolbench_240829.csv,kendall,random,10,5,0.6445033866354896,0.012304364739182175 aggregate,aggregate,toolbench_webshop_short,toolbench_240829.csv,kendall,random,10,6,0.4354648431614539,0.10366227301829671 aggregate,aggregate,toolbench_webshop_short,toolbench_240829.csv,kendall,random,10,7,0.35805743701971643,0.16431086269141681 aggregate,aggregate,toolbench_webshop_short,toolbench_240829.csv,kendall,random,10,8,0.4354648431614539,0.10366227301829671 aggregate,aggregate,toolbench_webshop_short,toolbench_240829.csv,kendall,random,10,9,0.6445033866354896,0.012304364739182175 aggregate,aggregate,toolbench_tabletop,toolbench_240829.csv,kendall,random,10,0,0.7191465199607916,0.004057136032371292 aggregate,aggregate,toolbench_tabletop,toolbench_240829.csv,kendall,random,10,1,0.764093177458341,0.002263469812035174 aggregate,aggregate,toolbench_tabletop,toolbench_240829.csv,kendall,random,10,2,0.764093177458341,0.002263469812035174 aggregate,aggregate,toolbench_tabletop,toolbench_240829.csv,kendall,random,10,3,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,toolbench_tabletop,toolbench_240829.csv,kendall,random,10,4,0.5393598899705937,0.03114121059579671 aggregate,aggregate,toolbench_tabletop,toolbench_240829.csv,kendall,random,10,5,0.764093177458341,0.002263469812035174 aggregate,aggregate,toolbench_tabletop,toolbench_240829.csv,kendall,random,10,6,0.6900655593423543,0.006458954266892998 aggregate,aggregate,toolbench_tabletop,toolbench_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,toolbench_tabletop,toolbench_240829.csv,kendall,random,10,8,0.6292532049656925,0.011921001496914019 aggregate,aggregate,toolbench_tabletop,toolbench_240829.csv,kendall,random,10,9,0.764093177458341,0.002263469812035174 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,1,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,3,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,4,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,9,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,4,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,8,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,0,0.35957325998039574,0.15076277502664528 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,3,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,4,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,0,0.37777777777777777,0.1557418430335097 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,1,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,2,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,3,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,5,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,9,0.4222222222222222,0.10831349206349207 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,0,0.37777777777777777,0.1557418430335097 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,1,0.40451991747794525,0.1059975484249457 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,2,0.7640931774583409,0.002263469812035174 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,3,0.5843065474681431,0.019550269092885535 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,4,0.5843065474681431,0.019550269092885535 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,5,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,6,0.6292532049656926,0.011921001496914019 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,0,0.24444444444444444,0.38071979717813054 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,2,0.7640931774583409,0.002263469812035174 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,5,0.6292532049656926,0.011921001496914019 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,6,0.7640931774583409,0.002263469812035174 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,7,0.9438798074485389,0.0001621317520439264 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,9,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,0,0.37777777777777777,0.1557418430335097 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,5,0.5393598899705937,0.03114121059579671 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,8,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,9,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,0,0.4222222222222222,0.10831349206349207 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,5,0.37777777777777777,0.1557418430335097 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,0,0.28888888888888886,0.2912483465608466 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,1,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,2,0.4222222222222222,0.10831349206349207 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,3,0.24444444444444444,0.38071979717813054 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,4,0.3333333333333333,0.21637345679012346 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,5,0.37777777777777777,0.1557418430335097 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,6,0.3333333333333333,0.21637345679012346 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,7,0.37777777777777777,0.1557418430335097 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,9,0.3333333333333333,0.21637345679012346 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,0,0.49441323247304414,0.048193488293190756 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,1,0.6292532049656926,0.011921001496914019 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,2,0.3333333333333333,0.21637345679012346 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,3,0.19999999999999998,0.4843127204585538 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,4,0.5843065474681431,0.019550269092885535 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,5,0.6292532049656926,0.011921001496914019 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,7,0.28888888888888886,0.2912483465608466 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,9,0.5393598899705937,0.03114121059579671 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,5,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,8,0.841126352013584,0.0008493886834182052 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,9,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,0,0.28888888888888886,0.2912483465608466 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,3,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,5,0.4222222222222222,0.10831349206349207 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,8,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,2,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,4,0.9438798074485389,0.0001621317520439264 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,2,0.6,0.016666115520282188 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,3,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,5,0.37777777777777777,0.1557418430335097 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,7,0.49441323247304414,0.048193488293190756 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,8,0.5111111111111111,0.04662257495590829 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,5,0.37777777777777777,0.1557418430335097 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,8,0.4222222222222222,0.10831349206349207 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,4,0.4222222222222222,0.10831349206349207 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,7,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,5,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,8,0.6,0.016666115520282188 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,2,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,4,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,5,0.5555555555555555,0.02860945767195767 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,8,0.6,0.016666115520282188 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,8,0.6,0.016666115520282188 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,AgentBench,agenbench_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,AgentBench,agenbench_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,AgentBench,agenbench_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552 aggregate,aggregate,AgentBench,agenbench_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,AgentBench,agenbench_240829.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,AgentBench,agenbench_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,AgentBench,agenbench_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,AgentBench,agenbench_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 aggregate,aggregate,AgentBench,agenbench_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,AgentBench,agenbench_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,MT-Bench,mtbench_240829_frozen.csv,kendall,random,10,0,-0.1111111111111111,0.7274895282186948 aggregate,aggregate,MT-Bench,mtbench_240829_frozen.csv,kendall,random,10,1,-0.1111111111111111,0.7274895282186948 aggregate,aggregate,MT-Bench,mtbench_240829_frozen.csv,kendall,random,10,2,-0.1111111111111111,0.7274895282186948 aggregate,aggregate,MT-Bench,mtbench_240829_frozen.csv,kendall,random,10,3,-0.1111111111111111,0.7274895282186948 aggregate,aggregate,MT-Bench,mtbench_240829_frozen.csv,kendall,random,10,4,-0.1111111111111111,0.7274895282186948 aggregate,aggregate,MT-Bench,mtbench_240829_frozen.csv,kendall,random,10,5,-0.1111111111111111,0.7274895282186948 aggregate,aggregate,MT-Bench,mtbench_240829_frozen.csv,kendall,random,10,6,-0.1111111111111111,0.7274895282186948 aggregate,aggregate,MT-Bench,mtbench_240829_frozen.csv,kendall,random,10,7,-0.1111111111111111,0.7274895282186948 aggregate,aggregate,MT-Bench,mtbench_240829_frozen.csv,kendall,random,10,8,-0.1111111111111111,0.7274895282186948 aggregate,aggregate,MT-Bench,mtbench_240829_frozen.csv,kendall,random,10,9,-0.1111111111111111,0.7274895282186948 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.37777777777777777,0.1557418430335097 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.6,0.016666115520282188 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.6,0.016666115520282188 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.6,0.016666115520282188 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.6,0.016666115520282188 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.3333333333333333,0.21637345679012346 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.5555555555555555,0.02860945767195767 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.37777777777777777,0.1557418430335097 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.6292532049656926,0.011921001496914019 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.6,0.016666115520282188 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.24444444444444444,0.38071979717813054 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.5555555555555555,0.02860945767195767 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.4222222222222222,0.10831349206349207 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.5111111111111111,0.04662257495590829 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.28888888888888886,0.2912483465608466 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.6,0.016666115520282188 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.37777777777777777,0.1557418430335097 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.37777777777777777,0.1557418430335097 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.4666666666666666,0.07255015432098766 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.15555555555555553,0.6006536596119929 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.5111111111111111,0.04662257495590829 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.6,0.016666115520282188 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.28888888888888886,0.2912483465608466 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.37777777777777777,0.1557418430335097 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.4494665749754947,0.07248608508684644 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.5555555555555555,0.02860945767195767 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.4222222222222222,0.10831349206349207 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,0,0.06666666666666667,0.8618005952380953 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,2,0.3333333333333333,0.21637345679012346 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,4,0.15555555555555553,0.6006536596119929 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,5,0.4222222222222222,0.10831349206349207 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,6,0.3333333333333333,0.21637345679012346 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,7,0.28888888888888886,0.2912483465608466 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,8,0.19999999999999998,0.4843127204585538 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,0,0.20459830184114206,0.4170770595205646 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,2,0.3333333333333333,0.21637345679012346 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,3,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,4,0.17978662999019787,0.47249761400068846 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,5,0.4222222222222222,0.10831349206349207 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,6,0.4494665749754947,0.07248608508684644 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,7,0.28888888888888886,0.2912483465608466 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,8,0.19999999999999998,0.4843127204585538 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,0,0.1111111111111111,0.7274895282186948 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,1,0.49441323247304414,0.048193488293190756 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,2,0.28888888888888886,0.2912483465608466 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,3,0.4494665749754947,0.07248608508684644 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,4,0.19999999999999998,0.4843127204585538 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,5,0.3146266024828463,0.2086677876982641 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,6,0.24444444444444444,0.38071979717813054 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,7,0.24444444444444444,0.38071979717813054 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,8,0.22473328748774735,0.36917141633269157 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,9,0.6292532049656926,0.011921001496914019 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,0,0.08989331499509894,0.7194375444233914 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,1,0.6292532049656926,0.011921001496914019 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,2,0.3333333333333333,0.21637345679012346 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,3,0.5393598899705937,0.03114121059579671 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,4,0.17978662999019787,0.47249761400068846 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,5,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,6,0.3333333333333333,0.21637345679012346 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,7,0.4222222222222222,0.10831349206349207 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,8,0.13483997249264842,0.590013887163346 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,9,0.7047274841194893,0.005187148855929351 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,0,0.24444444444444444,0.38071979717813054 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,1,0.5843065474681431,0.019550269092885535 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,2,0.4222222222222222,0.10831349206349207 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,3,0.49441323247304414,0.048193488293190756 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,4,0.19999999999999998,0.4843127204585538 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,6,0.3333333333333333,0.21637345679012346 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,7,0.28888888888888886,0.2912483465608466 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,8,0.28888888888888886,0.2912483465608466 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,0,0.37777777777777777,0.1557418430335097 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,2,0.4222222222222222,0.10831349206349207 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,3,0.4222222222222222,0.10831349206349207 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,4,0.28888888888888886,0.2912483465608466 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,5,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,6,0.4222222222222222,0.10831349206349207 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,8,0.22473328748774735,0.36917141633269157 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,0,0.04494665749754947,0.8574624419592412 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,1,0.4222222222222222,0.10831349206349207 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,2,0.1111111111111111,0.7274895282186948 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,3,0.37777777777777777,0.1557418430335097 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,4,0.1111111111111111,0.7274895282186948 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,5,0.24444444444444444,0.38071979717813054 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,6,0.08989331499509894,0.7194375444233914 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,7,0.13483997249264842,0.590013887163346 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,8,0.1111111111111111,0.7274895282186948 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,9,0.5555555555555555,0.02860945767195767 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,0,0.06666666666666667,0.8618005952380953 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,1,0.3333333333333333,0.21637345679012346 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,2,0.17978662999019787,0.47249761400068846 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,3,0.4222222222222222,0.10831349206349207 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,4,0.1111111111111111,0.7274895282186948 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,6,0.24444444444444444,0.38071979717813054 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,7,0.24444444444444444,0.38071979717813054 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,8,0.1111111111111111,0.7274895282186948 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,0,0.22473328748774735,0.36917141633269157 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,1,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,2,0.37777777777777777,0.1557418430335097 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,3,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,4,0.15555555555555553,0.6006536596119929 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,5,0.4222222222222222,0.10831349206349207 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,6,0.4222222222222222,0.10831349206349207 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,7,0.3333333333333333,0.21637345679012346 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,8,0.19999999999999998,0.4843127204585538 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,0,0.5393598899705937,0.03114121059579671 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,1,0.37777777777777777,0.1557418430335097 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,2,0.15555555555555553,0.6006536596119929 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,3,0.4222222222222222,0.10831349206349207 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,4,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,5,0.5393598899705937,0.03114121059579671 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,6,0.6,0.016666115520282188 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,7,0.3146266024828463,0.2086677876982641 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,8,0.08989331499509894,0.7194375444233914 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,9,0.7640931774583409,0.002263469812035174 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,1,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,1,0.7956600627155523,0.0016003458153453507 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,2,0.8539864924534399,0.0006436975254696865 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,3,0.8539864924534399,0.0006436975254696865 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,4,0.7640931774583409,0.002263469812035174 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,5,0.7191465199607915,0.004057136032371292 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,7,0.7191465199607915,0.004057136032371292 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,1,0.8539864924534399,0.0006436975254696865 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,2,0.6440611887195306,0.01102341638822145 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,3,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,4,0.8989331499509894,0.0003280163150135276 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,8,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,9,0.5843065474681431,0.019550269092885535 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,2,0.7047274841194893,0.005187148855929351 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,4,0.8539864924534399,0.0006436975254696865 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,8,0.5111111111111111,0.04662257495590829 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829