scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.9999999999999999,5.511463844797178e-07 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.9999999999999999,5.511463844797178e-07 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.9438798074485389,0.0001621317520439264 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.9999999999999999,5.511463844797178e-07 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9999999999999999,5.511463844797178e-07 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9555555555555554,5.5114638447971785e-06 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.9999999999999999,5.511463844797178e-07 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.19999999999999998,0.4843127204585538 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4222222222222222,0.10831349206349207 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.15555555555555553,0.6006536596119929 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.24444444444444444,0.38071979717813054 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.28888888888888886,0.2912483465608466 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,-0.1111111111111111,0.7274895282186948 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.37777777777777777,0.1557418430335097 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.24444444444444444,0.38071979717813054 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.28888888888888886,0.2912483465608466 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.24444444444444444,0.38071979717813054 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5111111111111111,0.04662257495590829 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7191465199607915,0.004057136032371292 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4666666666666666,0.07255015432098766 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.24444444444444444,0.38071979717813054 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.37777777777777777,0.1557418430335097 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.3333333333333333,0.21637345679012346 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.37777777777777777,0.1557418430335097 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8090398349558905,0.0012254240706707103 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7191465199607915,0.004057136032371292 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7191465199607915,0.004057136032371292 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.40451991747794525,0.1059975484249457 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4666666666666666,0.07255015432098766 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4222222222222222,0.10831349206349207 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6292532049656926,0.011921001496914019 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7191465199607915,0.004057136032371292 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.35957325998039574,0.15076277502664528 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8090398349558905,0.0012254240706707103 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6741998624632421,0.0070583320485280866 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8865926413116155,0.000437257592706733 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7640931774583409,0.002263469812035174 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8865926413116155,0.000437257592706733 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7071067811865475,0.006028046181872152 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7956600627155523,0.0016003458153453507 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.828078671210825,0.001082228864258374 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9320589306096472,0.00021829510003985143 HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8989331499509894,0.0003280163150135276 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,-0.15555555555555553,0.6006536596119929 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.37777777777777777,0.1557418430335097 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.4666666666666666,0.07255015432098766 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4222222222222222,0.10831349206349207 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.15555555555555553,0.6006536596119929 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,-0.1111111111111111,0.7274895282186948 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,-0.19999999999999998,0.4843127204585538 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4222222222222222,0.10831349206349207 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.3333333333333333,0.21637345679012346 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4222222222222222,0.10831349206349207 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6592611948214576,0.008926875535053643 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.37777777777777777,0.1557418430335097 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5843065474681431,0.019550269092885535 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.19999999999999998,0.4843127204585538 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4222222222222222,0.10831349206349207 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.37777777777777777,0.1557418430335097 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.3333333333333333,0.21637345679012346 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.3333333333333333,0.21637345679012346 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.24444444444444444,0.38071979717813054 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.24444444444444444,0.38071979717813054 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4222222222222222,0.10831349206349207 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5111111111111111,0.04662257495590829 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.3333333333333333,0.21637345679012346 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.24444444444444444,0.38071979717813054 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.19999999999999998,0.4843127204585538 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.28888888888888886,0.2912483465608466 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.3333333333333333,0.21637345679012346 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.28888888888888886,0.2912483465608466 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.28888888888888886,0.2912483465608466 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.28888888888888886,0.2912483465608466 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.3333333333333333,0.21637345679012346 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.28888888888888886,0.2912483465608466 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.37777777777777777,0.1557418430335097 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.37777777777777777,0.1557418430335097 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.37777777777777777,0.1557418430335097 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.4666666666666666,0.07255015432098766 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4222222222222222,0.10831349206349207 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4666666666666666,0.07255015432098766 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4222222222222222,0.10831349206349207 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4666666666666666,0.07255015432098766 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5111111111111111,0.04662257495590829 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5393598899705937,0.03114121059579671 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4666666666666666,0.07255015432098766 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6292532049656925,0.011921001496914019 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,0,0.31462660248284624,0.2086677876982641 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5393598899705937,0.03114121059579671 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5393598899705937,0.03114121059579671 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,4,0.37777777777777777,0.1557418430335097 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4222222222222222,0.10831349206349207 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,7,0.3333333333333333,0.21637345679012346 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,8,0.13483997249264842,0.590013887163346 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.9555555555555554,5.5114638447971785e-06 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4222222222222222,0.10831349206349207 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4666666666666666,0.07255015432098766 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4222222222222222,0.10831349206349207 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5555555555555555,0.02860945767195767 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8090398349558905,0.0012254240706707103 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7640931774583409,0.002263469812035174 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5843065474681431,0.019550269092885535 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7640931774583409,0.002263469812035174 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7191465199607915,0.004057136032371292 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4666666666666666,0.07255015432098766 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5555555555555555,0.02860945767195767 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5555555555555555,0.02860945767195767 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7191465199607915,0.004057136032371292 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6137949055234262,0.01491204965521593 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6292532049656926,0.011921001496914019 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6592611948214576,0.008926875535053643 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5683286162253945,0.02418521832271127 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4600437062282361,0.06941907499936387 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5980568180967069,0.018255860587459292 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6592611948214576,0.008926875535053643 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5555555555555555,0.02860945767195767 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.49441323247304414,0.048193488293190756 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.49441323247304414,0.048193488293190756 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.49441323247304414,0.048193488293190756 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.49441323247304414,0.048193488293190756 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.49441323247304414,0.048193488293190756 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.49441323247304414,0.048193488293190756 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.49441323247304414,0.048193488293190756 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.49441323247304414,0.048193488293190756 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.49441323247304414,0.048193488293190756 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.49441323247304414,0.048193488293190756 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,0,0.9999999999999999,5.511463844797178e-07 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,2,0.9999999999999999,5.511463844797178e-07 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,5,0.9555555555555554,5.5114638447971785e-06 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,6,0.9999999999999999,5.511463844797178e-07 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9999999999999999,5.511463844797178e-07 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9999999999999999,5.511463844797178e-07 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,9,0.9999999999999999,5.511463844797178e-07 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8539864924534399,0.0006436975254696865 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7640931774583409,0.002263469812035174 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8090398349558905,0.0012254240706707103 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.37777777777777777,0.1557418430335097 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7191465199607915,0.004057136032371292 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7191465199607915,0.004057136032371292 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7640931774583409,0.002263469812035174 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7191465199607915,0.004057136032371292 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8989331499509894,0.0003280163150135276 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6292532049656926,0.011921001496914019 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4666666666666666,0.07255015432098766 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.49441323247304414,0.048193488293190756 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7191465199607915,0.004057136032371292 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6741998624632421,0.0070583320485280866 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6741998624632421,0.0070583320485280866 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5393598899705937,0.03114121059579671 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5555555555555555,0.02860945767195767 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6741998624632421,0.0070583320485280866 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5060480768510597,0.04639449764906707 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6292532049656926,0.011921001496914019 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.35957325998039574,0.15076277502664528 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9888264649460883,7.772240795323086e-05 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6741998624632421,0.0070583320485280866 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7191465199607915,0.004057136032371292 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8090398349558905,0.0012254240706707103 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6292532049656926,0.011921001496914019 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.4494665749754947,0.07248608508684644 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8090398349558905,0.0012254240706707103 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5555555555555555,0.02860945767195767 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5683286162253945,0.02418521832271127 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7191465199607915,0.004057136032371292 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.522862326927363,0.03809415806109578 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5555555555555555,0.02860945767195767 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4222222222222222,0.10831349206349207 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5111111111111111,0.04662257495590829 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5555555555555555,0.02860945767195767 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.3333333333333333,0.21637345679012346 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.28888888888888886,0.2912483465608466 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4666666666666666,0.07255015432098766 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5111111111111111,0.04662257495590829 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.3333333333333333,0.21637345679012346 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4222222222222222,0.10831349206349207 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4222222222222222,0.10831349206349207 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4666666666666666,0.07255015432098766 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5555555555555555,0.02860945767195767 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.3333333333333333,0.21637345679012346 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.37777777777777777,0.1557418430335097 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.3333333333333333,0.21637345679012346 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.37777777777777777,0.1557418430335097 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.19999999999999998,0.4843127204585538 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.37777777777777777,0.1557418430335097 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4222222222222222,0.10831349206349207 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.37777777777777777,0.1557418430335097 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5555555555555555,0.02860945767195767 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5555555555555555,0.02860945767195767 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5555555555555555,0.02860945767195767 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.37777777777777777,0.1557418430335097 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.37777777777777777,0.1557418430335097 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4666666666666666,0.07255015432098766 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7640931774583409,0.002263469812035174 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.3333333333333333,0.21637345679012346 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8539864924534399,0.0006436975254696865 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5393598899705937,0.03114121059579671 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3146266024828463,0.2086677876982641 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.40451991747794525,0.1059975484249457 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4666666666666666,0.07255015432098766 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4666666666666666,0.07255015432098766 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5555555555555555,0.02860945767195767 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4222222222222222,0.10831349206349207 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4666666666666666,0.07255015432098766 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4222222222222222,0.10831349206349207 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.3333333333333333,0.21637345679012346 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.37777777777777777,0.1557418430335097 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.24444444444444444,0.38071979717813054 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4666666666666666,0.07255015432098766 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4222222222222222,0.10831349206349207 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.3333333333333333,0.21637345679012346 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6741998624632421,0.0070583320485280866 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.9438798074485389,0.0001621317520439264 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9555555555555554,5.5114638447971785e-06 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.49441323247304414,0.048193488293190756 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8989331499509894,0.0003280163150135276 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,0,0.31462660248284624,0.2086677876982641 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5393598899705937,0.03114121059579671 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5393598899705937,0.03114121059579671 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,4,0.37777777777777777,0.1557418430335097 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4222222222222222,0.10831349206349207 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,7,0.3333333333333333,0.21637345679012346 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,8,0.13483997249264842,0.590013887163346 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.3333333333333333,0.21637345679012346 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.28888888888888886,0.2912483465608466 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.4222222222222222,0.10831349206349207 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.4666666666666666,0.07255015432098766 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.24444444444444444,0.38071979717813054 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,-0.022222222222222223,1.0 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.37777777777777777,0.1557418430335097 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.4222222222222222,0.10831349206349207 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.4666666666666666,0.07255015432098766 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.9555555555555554,5.5114638447971785e-06 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.3333333333333333,0.21637345679012346 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.4222222222222222,0.10831349206349207 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.37777777777777777,0.1557418430335097 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.24444444444444444,0.38071979717813054 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.4222222222222222,0.10831349206349207 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.15555555555555553,0.6006536596119929 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.28888888888888886,0.2912483465608466 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4222222222222222,0.10831349206349207 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.3333333333333333,0.21637345679012346 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.28888888888888886,0.2912483465608466 BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4666666666666666,0.07255015432098766 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.3333333333333333,0.21637345679012346 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4222222222222222,0.10831349206349207 BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.40451991747794525,0.1059975484249457 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6292532049656926,0.011921001496914019 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.37777777777777777,0.1557418430335097 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.2500645911391736,0.3212772414140661 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.37777777777777777,0.1557418430335097 BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.49441323247304414,0.048193488293190756 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4666666666666666,0.07255015432098766 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6741998624632421,0.0070583320485280866 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.3333333333333333,0.21637345679012346 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.28888888888888886,0.2912483465608466 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.37777777777777777,0.1557418430335097 BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4666666666666666,0.07255015432098766 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5555555555555555,0.02860945767195767 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.37777777777777777,0.1557418430335097 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8090398349558905,0.0012254240706707103 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5393598899705937,0.03114121059579671 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4222222222222222,0.10831349206349207 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.28888888888888886,0.2912483465608466 BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.49441323247304414,0.048193488293190756 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.37777777777777777,0.1557418430335097 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.24444444444444444,0.38071979717813054 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.3146266024828463,0.2086677876982641 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5111111111111111,0.04662257495590829 BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5555555555555555,0.02860945767195767 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.4666666666666666,0.07255015432098766 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4666666666666666,0.07255015432098766 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4666666666666666,0.07255015432098766 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.3333333333333333,0.21637345679012346 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.24444444444444444,0.38071979717813054 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.28888888888888886,0.2912483465608466 BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5555555555555555,0.02860945767195767 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4222222222222222,0.10831349206349207 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.28888888888888886,0.2912483465608466 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.24444444444444444,0.38071979717813054 BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8539864924534399,0.0006436975254696865 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5393598899705937,0.03114121059579671 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6741998624632421,0.0070583320485280866 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5393598899705937,0.03114121059579671 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6292532049656926,0.011921001496914019 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5843065474681431,0.019550269092885535 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6292532049656926,0.011921001496914019 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7191465199607915,0.004057136032371292 BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7191465199607915,0.004057136032371292 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.3333333333333333,0.21637345679012346 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.7191465199607915,0.004057136032371292 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.4666666666666666,0.07255015432098766 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.5520524474738833,0.029345108174841844 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.6137949055234262,0.01491204965521593 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.7191465199607915,0.004057136032371292 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.7191465199607915,0.004057136032371292 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.4494665749754947,0.07248608508684644 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.28888888888888886,0.2912483465608466 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.8539864924534399,0.0006436975254696865 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.7640931774583409,0.002263469812035174 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.3333333333333333,0.21637345679012346 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.5555555555555555,0.02860945767195767 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.24444444444444444,0.38071979717813054 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.4666666666666666,0.07255015432098766 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.5555555555555555,0.02860945767195767 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.28888888888888886,0.2912483465608466 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,0,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,1,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,2,0.9438798074485389,0.0001621317520439264 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,5,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,7,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,8,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,9,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,0,0.19999999999999998,0.4843127204585538 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,1,0.4222222222222222,0.10831349206349207 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,2,0.15555555555555553,0.6006536596119929 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,3,0.24444444444444444,0.38071979717813054 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,4,0.28888888888888886,0.2912483465608466 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,5,-0.1111111111111111,0.7274895282186948 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,9,0.37777777777777777,0.1557418430335097 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,1,0.24444444444444444,0.38071979717813054 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,2,0.28888888888888886,0.2912483465608466 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,5,0.24444444444444444,0.38071979717813054 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,6,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,8,0.7191465199607915,0.004057136032371292 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,9,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,1,0.24444444444444444,0.38071979717813054 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,2,0.37777777777777777,0.1557418430335097 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,3,0.3333333333333333,0.21637345679012346 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,4,0.37777777777777777,0.1557418430335097 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,6,0.6,0.016666115520282188 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,1,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,2,0.7191465199607915,0.004057136032371292 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,3,0.7191465199607915,0.004057136032371292 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,0,0.40451991747794525,0.1059975484249457 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,1,0.6,0.016666115520282188 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,3,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,3,0.4222222222222222,0.10831349206349207 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,6,0.6292532049656926,0.011921001496914019 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,3,0.7191465199607915,0.004057136032371292 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,3,0.35957325998039574,0.15076277502664528 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,6,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,7,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.8865926413116155,0.000437257592706733 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.7640931774583409,0.002263469812035174 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.8865926413116155,0.000437257592706733 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.7071067811865475,0.006028046181872152 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.7956600627155523,0.0016003458153453507 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.828078671210825,0.001082228864258374 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.9320589306096472,0.00021829510003985143 aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.8989331499509894,0.0003280163150135276 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,0,-0.15555555555555553,0.6006536596119929 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.37777777777777777,0.1557418430335097 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.4666666666666666,0.07255015432098766 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.4222222222222222,0.10831349206349207 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.15555555555555553,0.6006536596119929 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,6,-0.1111111111111111,0.7274895282186948 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,7,-0.19999999999999998,0.4843127204585538 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.4222222222222222,0.10831349206349207 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.3333333333333333,0.21637345679012346 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.4222222222222222,0.10831349206349207 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.6592611948214576,0.008926875535053643 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.6,0.016666115520282188 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.37777777777777777,0.1557418430335097 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.5843065474681431,0.019550269092885535 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.19999999999999998,0.4843127204585538 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.4222222222222222,0.10831349206349207 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.37777777777777777,0.1557418430335097 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.3333333333333333,0.21637345679012346 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.3333333333333333,0.21637345679012346 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.24444444444444444,0.38071979717813054 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.24444444444444444,0.38071979717813054 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.4222222222222222,0.10831349206349207 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.5111111111111111,0.04662257495590829 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,2,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,0,0.3333333333333333,0.21637345679012346 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,1,0.24444444444444444,0.38071979717813054 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,2,0.19999999999999998,0.4843127204585538 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,3,0.28888888888888886,0.2912483465608466 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,4,0.3333333333333333,0.21637345679012346 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,5,0.28888888888888886,0.2912483465608466 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,6,0.28888888888888886,0.2912483465608466 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,7,0.28888888888888886,0.2912483465608466 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,8,0.3333333333333333,0.21637345679012346 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,9,0.28888888888888886,0.2912483465608466 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,2,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,2,0.37777777777777777,0.1557418430335097 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,5,0.37777777777777777,0.1557418430335097 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,9,0.37777777777777777,0.1557418430335097 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,2,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,3,0.4222222222222222,0.10831349206349207 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,5,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,7,0.4222222222222222,0.10831349206349207 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,9,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,5,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,8,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,1,0.5393598899705937,0.03114121059579671 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,4,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,6,0.6292532049656925,0.011921001496914019 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,8,0.6,0.016666115520282188 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,0,0.31462660248284624,0.2086677876982641 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,2,0.5393598899705937,0.03114121059579671 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,3,0.5393598899705937,0.03114121059579671 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,4,0.37777777777777777,0.1557418430335097 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,6,0.4222222222222222,0.10831349206349207 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,7,0.3333333333333333,0.21637345679012346 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,8,0.13483997249264842,0.590013887163346 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,0,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,1,0.4222222222222222,0.10831349206349207 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,5,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,6,0.4222222222222222,0.10831349206349207 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,7,0.6,0.016666115520282188 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,6,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,1,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,4,0.7640931774583409,0.002263469812035174 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,5,0.5843065474681431,0.019550269092885535 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,7,0.7640931774583409,0.002263469812035174 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,8,0.7191465199607915,0.004057136032371292 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,1,0.6,0.016666115520282188 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,3,0.4666666666666666,0.07255015432098766 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,5,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,6,0.6,0.016666115520282188 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,7,0.6,0.016666115520282188 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,8,0.6,0.016666115520282188 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,5,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,0,0.7191465199607915,0.004057136032371292 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,1,0.6137949055234262,0.01491204965521593 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,2,0.6292532049656926,0.011921001496914019 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,3,0.6592611948214576,0.008926875535053643 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,5,0.5683286162253945,0.02418521832271127 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,6,0.4600437062282361,0.06941907499936387 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,7,0.5980568180967069,0.018255860587459292 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,9,0.6592611948214576,0.008926875535053643 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,1,0.6,0.016666115520282188 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,6,0.5555555555555555,0.02860945767195767 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,0,0.49441323247304414,0.048193488293190756 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,1,0.49441323247304414,0.048193488293190756 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,2,0.49441323247304414,0.048193488293190756 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,3,0.49441323247304414,0.048193488293190756 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,4,0.49441323247304414,0.048193488293190756 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,5,0.49441323247304414,0.048193488293190756 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,6,0.49441323247304414,0.048193488293190756 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,7,0.49441323247304414,0.048193488293190756 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,8,0.49441323247304414,0.048193488293190756 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,9,0.49441323247304414,0.048193488293190756 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,0,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,2,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,5,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,6,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,7,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,8,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,9,0.9999999999999999,5.511463844797178e-07 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,2,0.8539864924534399,0.0006436975254696865 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,6,0.7640931774583409,0.002263469812035174 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,9,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,1,0.37777777777777777,0.1557418430335097 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,4,0.7191465199607915,0.004057136032371292 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,6,0.7191465199607915,0.004057136032371292 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,8,0.7640931774583409,0.002263469812035174 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,9,0.7191465199607915,0.004057136032371292 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,2,0.8989331499509894,0.0003280163150135276 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,4,0.6292532049656926,0.011921001496914019 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,5,0.4666666666666666,0.07255015432098766 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,0,0.49441323247304414,0.048193488293190756 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,1,0.7191465199607915,0.004057136032371292 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,2,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,3,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,4,0.5393598899705937,0.03114121059579671 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,5,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,7,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,0,0.5060480768510597,0.04639449764906707 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,1,0.6292532049656926,0.011921001496914019 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,4,0.35957325998039574,0.15076277502664528 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,8,0.9888264649460883,7.772240795323086e-05 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,9,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,2,0.6,0.016666115520282188 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,6,0.7191465199607915,0.004057136032371292 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,9,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,0,0.6292532049656926,0.011921001496914019 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,2,0.4494665749754947,0.07248608508684644 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,3,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,5,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,6,0.5683286162253945,0.02418521832271127 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,8,0.7191465199607915,0.004057136032371292 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,9,0.522862326927363,0.03809415806109578 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,2,0.5555555555555555,0.02860945767195767 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,3,0.4222222222222222,0.10831349206349207 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,6,0.5111111111111111,0.04662257495590829 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,9,0.5555555555555555,0.02860945767195767 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,2,0.6,0.016666115520282188 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,1,0.3333333333333333,0.21637345679012346 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,2,0.28888888888888886,0.2912483465608466 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,5,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,6,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,7,0.3333333333333333,0.21637345679012346 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,8,0.4222222222222222,0.10831349206349207 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,9,0.4222222222222222,0.10831349206349207 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,4,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,9,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,6,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,1,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,7,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,8,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,0,0.3333333333333333,0.21637345679012346 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,1,0.37777777777777777,0.1557418430335097 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,2,0.3333333333333333,0.21637345679012346 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,3,0.37777777777777777,0.1557418430335097 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,4,0.19999999999999998,0.4843127204585538 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,7,0.37777777777777777,0.1557418430335097 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,8,0.4222222222222222,0.10831349206349207 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,9,0.37777777777777777,0.1557418430335097 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,2,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,5,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,6,0.6,0.016666115520282188 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,9,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,2,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,4,0.37777777777777777,0.1557418430335097 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,5,0.37777777777777777,0.1557418430335097 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,9,0.4666666666666666,0.07255015432098766 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,1,0.7640931774583409,0.002263469812035174 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,2,0.3333333333333333,0.21637345679012346 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,3,0.8539864924534399,0.0006436975254696865 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,4,0.5393598899705937,0.03114121059579671 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,5,0.3146266024828463,0.2086677876982641 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,9,0.40451991747794525,0.1059975484249457 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,2,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,4,0.4666666666666666,0.07255015432098766 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,5,0.4666666666666666,0.07255015432098766 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,8,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,5,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,4,0.4222222222222222,0.10831349206349207 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,5,0.4666666666666666,0.07255015432098766 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,7,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,9,0.4222222222222222,0.10831349206349207 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,2,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,4,0.3333333333333333,0.21637345679012346 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,9,0.37777777777777777,0.1557418430335097 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,3,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,4,0.24444444444444444,0.38071979717813054 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,9,0.4666666666666666,0.07255015432098766 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,0,0.4222222222222222,0.10831349206349207 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,1,0.6,0.016666115520282188 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,8,0.6,0.016666115520282188 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,0,0.3333333333333333,0.21637345679012346 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,7,0.6,0.016666115520282188 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,0,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,2,0.9438798074485389,0.0001621317520439264 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,8,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,0,0.49441323247304414,0.048193488293190756 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,2,0.8989331499509894,0.0003280163150135276 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,0,0.31462660248284624,0.2086677876982641 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,2,0.5393598899705937,0.03114121059579671 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,3,0.5393598899705937,0.03114121059579671 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,4,0.37777777777777777,0.1557418430335097 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,6,0.4222222222222222,0.10831349206349207 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,7,0.3333333333333333,0.21637345679012346 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,8,0.13483997249264842,0.590013887163346 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.3333333333333333,0.21637345679012346 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.6,0.016666115520282188 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.28888888888888886,0.2912483465608466 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.4222222222222222,0.10831349206349207 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.4666666666666666,0.07255015432098766 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.24444444444444444,0.38071979717813054 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,-0.022222222222222223,1.0 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.37777777777777777,0.1557418430335097 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.4222222222222222,0.10831349206349207 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.4666666666666666,0.07255015432098766 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.9555555555555554,5.5114638447971785e-06 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.3333333333333333,0.21637345679012346 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.6,0.016666115520282188 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.4222222222222222,0.10831349206349207 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.37777777777777777,0.1557418430335097 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.6,0.016666115520282188 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.24444444444444444,0.38071979717813054 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.4222222222222222,0.10831349206349207 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.15555555555555553,0.6006536596119929 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.28888888888888886,0.2912483465608466 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.6,0.016666115520282188 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,6,0.4222222222222222,0.10831349206349207 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,7,0.3333333333333333,0.21637345679012346 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,8,0.28888888888888886,0.2912483465608466 aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,1,0.6,0.016666115520282188 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,3,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,7,0.3333333333333333,0.21637345679012346 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,8,0.4222222222222222,0.10831349206349207 aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,1,0.40451991747794525,0.1059975484249457 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,4,0.6292532049656926,0.011921001496914019 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,6,0.37777777777777777,0.1557418430335097 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,7,0.2500645911391736,0.3212772414140661 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,8,0.37777777777777777,0.1557418430335097 aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,1,0.49441323247304414,0.048193488293190756 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,4,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,5,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,6,0.3333333333333333,0.21637345679012346 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,7,0.28888888888888886,0.2912483465608466 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,8,0.37777777777777777,0.1557418430335097 aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,1,0.6,0.016666115520282188 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,3,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,6,0.5555555555555555,0.02860945767195767 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,7,0.37777777777777777,0.1557418430335097 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,5,0.8090398349558905,0.0012254240706707103 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,6,0.5393598899705937,0.03114121059579671 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,7,0.4222222222222222,0.10831349206349207 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,8,0.28888888888888886,0.2912483465608466 aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,2,0.49441323247304414,0.048193488293190756 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,3,0.37777777777777777,0.1557418430335097 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,6,0.24444444444444444,0.38071979717813054 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,7,0.3146266024828463,0.2086677876982641 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,8,0.5111111111111111,0.04662257495590829 aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,9,0.5555555555555555,0.02860945767195767 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,2,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,3,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,4,0.4666666666666666,0.07255015432098766 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,6,0.3333333333333333,0.21637345679012346 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,7,0.24444444444444444,0.38071979717813054 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,8,0.28888888888888886,0.2912483465608466 aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,2,0.5555555555555555,0.02860945767195767 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,3,0.6,0.016666115520282188 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,4,0.6,0.016666115520282188 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,6,0.4222222222222222,0.10831349206349207 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,7,0.28888888888888886,0.2912483465608466 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,8,0.24444444444444444,0.38071979717813054 aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,0,0.8539864924534399,0.0006436975254696865 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,1,0.5393598899705937,0.03114121059579671 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,2,0.6741998624632421,0.0070583320485280866 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,3,0.5393598899705937,0.03114121059579671 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,4,0.6292532049656926,0.011921001496914019 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,6,0.5843065474681431,0.019550269092885535 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,7,0.6292532049656926,0.011921001496914019 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,8,0.7191465199607915,0.004057136032371292 aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,9,0.7191465199607915,0.004057136032371292 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,3,0.6,0.016666115520282188 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,8,0.3333333333333333,0.21637345679012346 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,0,0.7191465199607915,0.004057136032371292 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,1,0.4666666666666666,0.07255015432098766 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,2,0.5520524474738833,0.029345108174841844 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,3,0.6137949055234262,0.01491204965521593 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,4,0.7191465199607915,0.004057136032371292 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,5,0.7191465199607915,0.004057136032371292 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,6,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,7,0.4494665749754947,0.07248608508684644 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,8,0.28888888888888886,0.2912483465608466 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,9,0.8539864924534399,0.0006436975254696865 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,1,0.7640931774583409,0.002263469812035174 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,2,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,7,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,8,0.3333333333333333,0.21637345679012346 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,3,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,1,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,5,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,7,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,0,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,2,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,3,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,7,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,8,0.24444444444444444,0.38071979717813054 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,1,0.4666666666666666,0.07255015432098766 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,2,0.6,0.016666115520282188 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,6,0.5555555555555555,0.02860945767195767 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,8,0.28888888888888886,0.2912483465608466 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755