benchbench / cache /agreements_cache_9c0710fd06d230cc89e0f2f023e3058f.csv
Yotam-Perlitz
revise cache
292d764
raw
history blame
239 kB
scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.9999999999999999,5.511463844797178e-07
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.9999999999999999,5.511463844797178e-07
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.9438798074485389,0.0001621317520439264
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.9999999999999999,5.511463844797178e-07
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9999999999999999,5.511463844797178e-07
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9555555555555554,5.5114638447971785e-06
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.9999999999999999,5.511463844797178e-07
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.19999999999999998,0.4843127204585538
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4222222222222222,0.10831349206349207
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.15555555555555553,0.6006536596119929
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.24444444444444444,0.38071979717813054
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.28888888888888886,0.2912483465608466
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,-0.1111111111111111,0.7274895282186948
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.37777777777777777,0.1557418430335097
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.24444444444444444,0.38071979717813054
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.28888888888888886,0.2912483465608466
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.24444444444444444,0.38071979717813054
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5111111111111111,0.04662257495590829
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7191465199607915,0.004057136032371292
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4666666666666666,0.07255015432098766
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.24444444444444444,0.38071979717813054
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.37777777777777777,0.1557418430335097
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.3333333333333333,0.21637345679012346
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.37777777777777777,0.1557418430335097
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8222222222222221,0.0003576940035273369
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8090398349558905,0.0012254240706707103
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7191465199607915,0.004057136032371292
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7191465199607915,0.004057136032371292
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.40451991747794525,0.1059975484249457
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4666666666666666,0.07255015432098766
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4222222222222222,0.10831349206349207
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6292532049656926,0.011921001496914019
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7191465199607915,0.004057136032371292
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.35957325998039574,0.15076277502664528
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8090398349558905,0.0012254240706707103
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6741998624632421,0.0070583320485280866
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8666666666666666,0.00011518959435626102
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05
HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8865926413116155,0.000437257592706733
HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7640931774583409,0.002263469812035174
HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8865926413116155,0.000437257592706733
HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7071067811865475,0.006028046181872152
HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7956600627155523,0.0016003458153453507
HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552
HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06
HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.828078671210825,0.001082228864258374
HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9320589306096472,0.00021829510003985143
HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8989331499509894,0.0003280163150135276
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,-0.15555555555555553,0.6006536596119929
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.37777777777777777,0.1557418430335097
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.4666666666666666,0.07255015432098766
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4222222222222222,0.10831349206349207
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.15555555555555553,0.6006536596119929
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,-0.1111111111111111,0.7274895282186948
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,-0.19999999999999998,0.4843127204585538
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8666666666666666,0.00011518959435626102
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4222222222222222,0.10831349206349207
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.3333333333333333,0.21637345679012346
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4222222222222222,0.10831349206349207
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6592611948214576,0.008926875535053643
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.37777777777777777,0.1557418430335097
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5843065474681431,0.019550269092885535
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.19999999999999998,0.4843127204585538
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4222222222222222,0.10831349206349207
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.37777777777777777,0.1557418430335097
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.3333333333333333,0.21637345679012346
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.3333333333333333,0.21637345679012346
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.24444444444444444,0.38071979717813054
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.24444444444444444,0.38071979717813054
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4222222222222222,0.10831349206349207
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5111111111111111,0.04662257495590829
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.3333333333333333,0.21637345679012346
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.24444444444444444,0.38071979717813054
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.19999999999999998,0.4843127204585538
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.28888888888888886,0.2912483465608466
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.3333333333333333,0.21637345679012346
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.28888888888888886,0.2912483465608466
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.28888888888888886,0.2912483465608466
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.28888888888888886,0.2912483465608466
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.3333333333333333,0.21637345679012346
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.28888888888888886,0.2912483465608466
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.37777777777777777,0.1557418430335097
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.37777777777777777,0.1557418430335097
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.37777777777777777,0.1557418430335097
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.4666666666666666,0.07255015432098766
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4222222222222222,0.10831349206349207
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4666666666666666,0.07255015432098766
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4222222222222222,0.10831349206349207
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4666666666666666,0.07255015432098766
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5111111111111111,0.04662257495590829
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5393598899705937,0.03114121059579671
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4666666666666666,0.07255015432098766
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6292532049656925,0.011921001496914019
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,0,0.31462660248284624,0.2086677876982641
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5393598899705937,0.03114121059579671
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5393598899705937,0.03114121059579671
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,4,0.37777777777777777,0.1557418430335097
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4222222222222222,0.10831349206349207
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,7,0.3333333333333333,0.21637345679012346
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,8,0.13483997249264842,0.590013887163346
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.9555555555555554,5.5114638447971785e-06
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8666666666666666,0.00011518959435626102
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4222222222222222,0.10831349206349207
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4666666666666666,0.07255015432098766
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4222222222222222,0.10831349206349207
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5555555555555555,0.02860945767195767
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8090398349558905,0.0012254240706707103
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7640931774583409,0.002263469812035174
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5843065474681431,0.019550269092885535
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7640931774583409,0.002263469812035174
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7191465199607915,0.004057136032371292
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4666666666666666,0.07255015432098766
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5555555555555555,0.02860945767195767
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5555555555555555,0.02860945767195767
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7191465199607915,0.004057136032371292
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6137949055234262,0.01491204965521593
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6292532049656926,0.011921001496914019
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6592611948214576,0.008926875535053643
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5683286162253945,0.02418521832271127
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4600437062282361,0.06941907499936387
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5980568180967069,0.018255860587459292
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6592611948214576,0.008926875535053643
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5555555555555555,0.02860945767195767
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.49441323247304414,0.048193488293190756
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.49441323247304414,0.048193488293190756
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.49441323247304414,0.048193488293190756
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.49441323247304414,0.048193488293190756
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.49441323247304414,0.048193488293190756
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.49441323247304414,0.048193488293190756
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.49441323247304414,0.048193488293190756
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.49441323247304414,0.048193488293190756
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.49441323247304414,0.048193488293190756
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.49441323247304414,0.048193488293190756
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,0,0.9999999999999999,5.511463844797178e-07
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,2,0.9999999999999999,5.511463844797178e-07
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,5,0.9555555555555554,5.5114638447971785e-06
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,6,0.9999999999999999,5.511463844797178e-07
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9999999999999999,5.511463844797178e-07
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9999999999999999,5.511463844797178e-07
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,9,0.9999999999999999,5.511463844797178e-07
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8539864924534399,0.0006436975254696865
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8666666666666666,0.00011518959435626102
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7640931774583409,0.002263469812035174
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8090398349558905,0.0012254240706707103
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.37777777777777777,0.1557418430335097
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7191465199607915,0.004057136032371292
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7191465199607915,0.004057136032371292
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7640931774583409,0.002263469812035174
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7191465199607915,0.004057136032371292
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8666666666666666,0.00011518959435626102
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8989331499509894,0.0003280163150135276
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6292532049656926,0.011921001496914019
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4666666666666666,0.07255015432098766
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.49441323247304414,0.048193488293190756
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7191465199607915,0.004057136032371292
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6741998624632421,0.0070583320485280866
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6741998624632421,0.0070583320485280866
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5393598899705937,0.03114121059579671
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5555555555555555,0.02860945767195767
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6741998624632421,0.0070583320485280866
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5060480768510597,0.04639449764906707
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6292532049656926,0.011921001496914019
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.35957325998039574,0.15076277502664528
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9888264649460883,7.772240795323086e-05
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6741998624632421,0.0070583320485280866
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7191465199607915,0.004057136032371292
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8090398349558905,0.0012254240706707103
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6292532049656926,0.011921001496914019
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.4494665749754947,0.07248608508684644
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8090398349558905,0.0012254240706707103
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5555555555555555,0.02860945767195767
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5683286162253945,0.02418521832271127
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7191465199607915,0.004057136032371292
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.522862326927363,0.03809415806109578
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5555555555555555,0.02860945767195767
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4222222222222222,0.10831349206349207
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5111111111111111,0.04662257495590829
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5555555555555555,0.02860945767195767
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755
OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102
OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369
OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102
OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369
OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102
OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369
OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102
OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369
OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369
OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.3333333333333333,0.21637345679012346
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.28888888888888886,0.2912483465608466
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4666666666666666,0.07255015432098766
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5111111111111111,0.04662257495590829
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.3333333333333333,0.21637345679012346
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4222222222222222,0.10831349206349207
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4222222222222222,0.10831349206349207
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4666666666666666,0.07255015432098766
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5555555555555555,0.02860945767195767
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8222222222222221,0.0003576940035273369
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,0,0.3333333333333333,0.21637345679012346
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,1,0.37777777777777777,0.1557418430335097
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,2,0.3333333333333333,0.21637345679012346
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,3,0.37777777777777777,0.1557418430335097
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,4,0.19999999999999998,0.4843127204585538
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,7,0.37777777777777777,0.1557418430335097
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4222222222222222,0.10831349206349207
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,10,9,0.37777777777777777,0.1557418430335097
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5555555555555555,0.02860945767195767
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5555555555555555,0.02860945767195767
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5555555555555555,0.02860945767195767
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.37777777777777777,0.1557418430335097
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.37777777777777777,0.1557418430335097
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4666666666666666,0.07255015432098766
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7640931774583409,0.002263469812035174
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.3333333333333333,0.21637345679012346
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8539864924534399,0.0006436975254696865
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5393598899705937,0.03114121059579671
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3146266024828463,0.2086677876982641
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.40451991747794525,0.1059975484249457
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4666666666666666,0.07255015432098766
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4666666666666666,0.07255015432098766
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5555555555555555,0.02860945767195767
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4222222222222222,0.10831349206349207
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4666666666666666,0.07255015432098766
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4222222222222222,0.10831349206349207
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.3333333333333333,0.21637345679012346
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.37777777777777777,0.1557418430335097
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.24444444444444444,0.38071979717813054
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4666666666666666,0.07255015432098766
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8666666666666666,0.00011518959435626102
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4222222222222222,0.10831349206349207
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.3333333333333333,0.21637345679012346
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6741998624632421,0.0070583320485280866
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.9438798074485389,0.0001621317520439264
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8666666666666666,0.00011518959435626102
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9555555555555554,5.5114638447971785e-06
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.49441323247304414,0.048193488293190756
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8989331499509894,0.0003280163150135276
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8666666666666666,0.00011518959435626102
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,0,0.31462660248284624,0.2086677876982641
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5393598899705937,0.03114121059579671
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5393598899705937,0.03114121059579671
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,4,0.37777777777777777,0.1557418430335097
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4222222222222222,0.10831349206349207
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,7,0.3333333333333333,0.21637345679012346
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,8,0.13483997249264842,0.590013887163346
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.8222222222222221,0.0003576940035273369
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.3333333333333333,0.21637345679012346
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.28888888888888886,0.2912483465608466
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.4222222222222222,0.10831349206349207
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.4666666666666666,0.07255015432098766
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.24444444444444444,0.38071979717813054
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,-0.022222222222222223,1.0
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.37777777777777777,0.1557418430335097
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.4222222222222222,0.10831349206349207
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.4666666666666666,0.07255015432098766
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.9555555555555554,5.5114638447971785e-06
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.3333333333333333,0.21637345679012346
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.4222222222222222,0.10831349206349207
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.37777777777777777,0.1557418430335097
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.24444444444444444,0.38071979717813054
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.4222222222222222,0.10831349206349207
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.15555555555555553,0.6006536596119929
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.28888888888888886,0.2912483465608466
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188
BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067
BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829
BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829
BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767
BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188
BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552
BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4222222222222222,0.10831349206349207
BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.3333333333333333,0.21637345679012346
BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.28888888888888886,0.2912483465608466
BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836
BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836
BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188
BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829
BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4666666666666666,0.07255015432098766
BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552
BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755
BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766
BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.3333333333333333,0.21637345679012346
BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4222222222222222,0.10831349206349207
BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067
BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552
BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.40451991747794525,0.1059975484249457
BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829
BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767
BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6292532049656926,0.011921001496914019
BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836
BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.37777777777777777,0.1557418430335097
BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.2500645911391736,0.3212772414140661
BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.37777777777777777,0.1557418430335097
BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188
BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552
BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.49441323247304414,0.048193488293190756
BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829
BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829
BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4666666666666666,0.07255015432098766
BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6741998624632421,0.0070583320485280866
BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.3333333333333333,0.21637345679012346
BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.28888888888888886,0.2912483465608466
BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.37777777777777777,0.1557418430335097
BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188
BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067
BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188
BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829
BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4666666666666666,0.07255015432098766
BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836
BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067
BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5555555555555555,0.02860945767195767
BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.37777777777777777,0.1557418430335097
BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767
BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067
BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067
BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767
BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829
BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767
BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829
BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8090398349558905,0.0012254240706707103
BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5393598899705937,0.03114121059579671
BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4222222222222222,0.10831349206349207
BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.28888888888888886,0.2912483465608466
BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067
BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552
BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767
BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.49441323247304414,0.048193488293190756
BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.37777777777777777,0.1557418430335097
BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767
BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836
BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.24444444444444444,0.38071979717813054
BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.3146266024828463,0.2086677876982641
BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5111111111111111,0.04662257495590829
BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5555555555555555,0.02860945767195767
BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188
BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829
BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.4666666666666666,0.07255015432098766
BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4666666666666666,0.07255015432098766
BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.4666666666666666,0.07255015432098766
BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836
BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.3333333333333333,0.21637345679012346
BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.24444444444444444,0.38071979717813054
BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.28888888888888886,0.2912483465608466
BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829
BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552
BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829
BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5555555555555555,0.02860945767195767
BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188
BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188
BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552
BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4222222222222222,0.10831349206349207
BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.28888888888888886,0.2912483465608466
BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.24444444444444444,0.38071979717813054
BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836
BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8539864924534399,0.0006436975254696865
BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5393598899705937,0.03114121059579671
BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6741998624632421,0.0070583320485280866
BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5393598899705937,0.03114121059579671
BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6292532049656926,0.011921001496914019
BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836
BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5843065474681431,0.019550269092885535
BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6292532049656926,0.011921001496914019
BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7191465199607915,0.004057136032371292
BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7191465199607915,0.004057136032371292
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.3333333333333333,0.21637345679012346
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.7191465199607915,0.004057136032371292
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.4666666666666666,0.07255015432098766
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.5520524474738833,0.029345108174841844
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.6137949055234262,0.01491204965521593
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.7191465199607915,0.004057136032371292
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.7191465199607915,0.004057136032371292
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.4494665749754947,0.07248608508684644
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.28888888888888886,0.2912483465608466
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.8539864924534399,0.0006436975254696865
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.7640931774583409,0.002263469812035174
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.3333333333333333,0.21637345679012346
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.5555555555555555,0.02860945767195767
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.24444444444444444,0.38071979717813054
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.4666666666666666,0.07255015432098766
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.5555555555555555,0.02860945767195767
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.28888888888888886,0.2912483465608466
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,0,0.9999999999999999,5.511463844797178e-07
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,1,0.9999999999999999,5.511463844797178e-07
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,2,0.9438798074485389,0.0001621317520439264
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,5,0.9999999999999999,5.511463844797178e-07
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,7,0.9999999999999999,5.511463844797178e-07
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,8,0.9555555555555554,5.5114638447971785e-06
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,9,0.9999999999999999,5.511463844797178e-07
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,0,0.19999999999999998,0.4843127204585538
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,1,0.4222222222222222,0.10831349206349207
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,2,0.15555555555555553,0.6006536596119929
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,3,0.24444444444444444,0.38071979717813054
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,4,0.28888888888888886,0.2912483465608466
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,5,-0.1111111111111111,0.7274895282186948
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,9,0.37777777777777777,0.1557418430335097
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,1,0.24444444444444444,0.38071979717813054
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,2,0.28888888888888886,0.2912483465608466
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,5,0.24444444444444444,0.38071979717813054
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,6,0.5111111111111111,0.04662257495590829
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,8,0.7191465199607915,0.004057136032371292
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,9,0.4666666666666666,0.07255015432098766
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,1,0.24444444444444444,0.38071979717813054
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,2,0.37777777777777777,0.1557418430335097
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,3,0.3333333333333333,0.21637345679012346
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,4,0.37777777777777777,0.1557418430335097
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,6,0.6,0.016666115520282188
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,0,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,1,0.8090398349558905,0.0012254240706707103
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,2,0.7191465199607915,0.004057136032371292
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,3,0.7191465199607915,0.004057136032371292
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,4,0.6,0.016666115520282188
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,0,0.40451991747794525,0.1059975484249457
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,1,0.6,0.016666115520282188
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,3,0.4666666666666666,0.07255015432098766
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,9,0.6,0.016666115520282188
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,3,0.4222222222222222,0.10831349206349207
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,6,0.6292532049656926,0.011921001496914019
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,3,0.7191465199607915,0.004057136032371292
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,3,0.35957325998039574,0.15076277502664528
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,6,0.8090398349558905,0.0012254240706707103
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,7,0.6741998624632421,0.0070583320485280866
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,8,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.8865926413116155,0.000437257592706733
aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.7640931774583409,0.002263469812035174
aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.8865926413116155,0.000437257592706733
aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.7071067811865475,0.006028046181872152
aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.7956600627155523,0.0016003458153453507
aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552
aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06
aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.828078671210825,0.001082228864258374
aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.9320589306096472,0.00021829510003985143
aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.8989331499509894,0.0003280163150135276
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,0,-0.15555555555555553,0.6006536596119929
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.37777777777777777,0.1557418430335097
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.4666666666666666,0.07255015432098766
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.4222222222222222,0.10831349206349207
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.15555555555555553,0.6006536596119929
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,6,-0.1111111111111111,0.7274895282186948
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,7,-0.19999999999999998,0.4843127204585538
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.4222222222222222,0.10831349206349207
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.3333333333333333,0.21637345679012346
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.4222222222222222,0.10831349206349207
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.6592611948214576,0.008926875535053643
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.6,0.016666115520282188
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.37777777777777777,0.1557418430335097
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.5843065474681431,0.019550269092885535
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.19999999999999998,0.4843127204585538
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.4222222222222222,0.10831349206349207
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.37777777777777777,0.1557418430335097
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.3333333333333333,0.21637345679012346
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.3333333333333333,0.21637345679012346
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.24444444444444444,0.38071979717813054
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.24444444444444444,0.38071979717813054
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.4222222222222222,0.10831349206349207
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.5111111111111111,0.04662257495590829
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,2,0.6,0.016666115520282188
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,5,0.6,0.016666115520282188
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,10,9,0.6,0.016666115520282188
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,0,0.3333333333333333,0.21637345679012346
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,1,0.24444444444444444,0.38071979717813054
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,2,0.19999999999999998,0.4843127204585538
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,3,0.28888888888888886,0.2912483465608466
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,4,0.3333333333333333,0.21637345679012346
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,5,0.28888888888888886,0.2912483465608466
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,6,0.28888888888888886,0.2912483465608466
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,7,0.28888888888888886,0.2912483465608466
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,8,0.3333333333333333,0.21637345679012346
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,10,9,0.28888888888888886,0.2912483465608466
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,2,0.6,0.016666115520282188
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,5,0.6,0.016666115520282188
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,10,9,0.6,0.016666115520282188
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,2,0.37777777777777777,0.1557418430335097
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,4,0.6,0.016666115520282188
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,5,0.37777777777777777,0.1557418430335097
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,10,9,0.37777777777777777,0.1557418430335097
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,2,0.4666666666666666,0.07255015432098766
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,3,0.4222222222222222,0.10831349206349207
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,5,0.4666666666666666,0.07255015432098766
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,7,0.4222222222222222,0.10831349206349207
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,10,9,0.4666666666666666,0.07255015432098766
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,0,0.6,0.016666115520282188
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,4,0.6,0.016666115520282188
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,5,0.5111111111111111,0.04662257495590829
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,8,0.6,0.016666115520282188
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,1,0.5393598899705937,0.03114121059579671
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,4,0.4666666666666666,0.07255015432098766
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,6,0.6292532049656925,0.011921001496914019
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,8,0.6,0.016666115520282188
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,0,0.31462660248284624,0.2086677876982641
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,2,0.5393598899705937,0.03114121059579671
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,3,0.5393598899705937,0.03114121059579671
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,4,0.37777777777777777,0.1557418430335097
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,6,0.4222222222222222,0.10831349206349207
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,7,0.3333333333333333,0.21637345679012346
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,8,0.13483997249264842,0.590013887163346
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,9,0.6,0.016666115520282188
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,0,0.9555555555555554,5.5114638447971785e-06
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,8,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,1,0.4222222222222222,0.10831349206349207
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,5,0.4666666666666666,0.07255015432098766
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,6,0.4222222222222222,0.10831349206349207
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,7,0.6,0.016666115520282188
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,6,0.5555555555555555,0.02860945767195767
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,1,0.8090398349558905,0.0012254240706707103
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,4,0.7640931774583409,0.002263469812035174
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,5,0.5843065474681431,0.019550269092885535
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,7,0.7640931774583409,0.002263469812035174
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,8,0.7191465199607915,0.004057136032371292
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,1,0.6,0.016666115520282188
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,3,0.4666666666666666,0.07255015432098766
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,4,0.6,0.016666115520282188
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,5,0.5555555555555555,0.02860945767195767
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,6,0.6,0.016666115520282188
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,7,0.6,0.016666115520282188
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,8,0.6,0.016666115520282188
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,5,0.5555555555555555,0.02860945767195767
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,0,0.7191465199607915,0.004057136032371292
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,1,0.6137949055234262,0.01491204965521593
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,2,0.6292532049656926,0.011921001496914019
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,3,0.6592611948214576,0.008926875535053643
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,5,0.5683286162253945,0.02418521832271127
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,6,0.4600437062282361,0.06941907499936387
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,7,0.5980568180967069,0.018255860587459292
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,9,0.6592611948214576,0.008926875535053643
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,1,0.6,0.016666115520282188
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,6,0.5555555555555555,0.02860945767195767
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,0,0.49441323247304414,0.048193488293190756
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,1,0.49441323247304414,0.048193488293190756
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,2,0.49441323247304414,0.048193488293190756
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,3,0.49441323247304414,0.048193488293190756
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,4,0.49441323247304414,0.048193488293190756
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,5,0.49441323247304414,0.048193488293190756
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,6,0.49441323247304414,0.048193488293190756
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,7,0.49441323247304414,0.048193488293190756
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,8,0.49441323247304414,0.048193488293190756
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,9,0.49441323247304414,0.048193488293190756
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,0,0.9999999999999999,5.511463844797178e-07
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,2,0.9999999999999999,5.511463844797178e-07
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,5,0.9555555555555554,5.5114638447971785e-06
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,6,0.9999999999999999,5.511463844797178e-07
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,7,0.9999999999999999,5.511463844797178e-07
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,8,0.9999999999999999,5.511463844797178e-07
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,9,0.9999999999999999,5.511463844797178e-07
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,2,0.8539864924534399,0.0006436975254696865
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,5,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,6,0.7640931774583409,0.002263469812035174
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,9,0.8090398349558905,0.0012254240706707103
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,1,0.37777777777777777,0.1557418430335097
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,4,0.7191465199607915,0.004057136032371292
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,6,0.7191465199607915,0.004057136032371292
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,8,0.7640931774583409,0.002263469812035174
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,9,0.7191465199607915,0.004057136032371292
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,8,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,2,0.8989331499509894,0.0003280163150135276
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,4,0.6292532049656926,0.011921001496914019
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,5,0.4666666666666666,0.07255015432098766
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,9,0.6,0.016666115520282188
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,0,0.49441323247304414,0.048193488293190756
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,1,0.7191465199607915,0.004057136032371292
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,2,0.6741998624632421,0.0070583320485280866
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,3,0.6741998624632421,0.0070583320485280866
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,4,0.5393598899705937,0.03114121059579671
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,5,0.5555555555555555,0.02860945767195767
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,7,0.6741998624632421,0.0070583320485280866
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,0,0.5060480768510597,0.04639449764906707
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,1,0.6292532049656926,0.011921001496914019
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,4,0.35957325998039574,0.15076277502664528
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,8,0.9888264649460883,7.772240795323086e-05
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,9,0.6741998624632421,0.0070583320485280866
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,2,0.6,0.016666115520282188
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,5,0.6,0.016666115520282188
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,6,0.7191465199607915,0.004057136032371292
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,9,0.8090398349558905,0.0012254240706707103
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,0,0.6292532049656926,0.011921001496914019
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,2,0.4494665749754947,0.07248608508684644
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,3,0.8090398349558905,0.0012254240706707103
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,5,0.5555555555555555,0.02860945767195767
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,6,0.5683286162253945,0.02418521832271127
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,8,0.7191465199607915,0.004057136032371292
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,9,0.522862326927363,0.03809415806109578
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,2,0.5555555555555555,0.02860945767195767
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,3,0.4222222222222222,0.10831349206349207
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,6,0.5111111111111111,0.04662257495590829
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,9,0.5555555555555555,0.02860945767195767
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,2,0.6,0.016666115520282188
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,4,0.6,0.016666115520282188
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,1,0.3333333333333333,0.21637345679012346
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,2,0.28888888888888886,0.2912483465608466
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,5,0.4666666666666666,0.07255015432098766
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,6,0.5111111111111111,0.04662257495590829
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,7,0.3333333333333333,0.21637345679012346
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,8,0.4222222222222222,0.10831349206349207
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,10,9,0.4222222222222222,0.10831349206349207
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,4,0.4666666666666666,0.07255015432098766
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,5,0.6,0.016666115520282188
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,10,9,0.5555555555555555,0.02860945767195767
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,6,0.6,0.016666115520282188
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,10,9,0.6,0.016666115520282188
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,0,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,1,0.6,0.016666115520282188
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,7,0.6,0.016666115520282188
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,8,0.6,0.016666115520282188
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,10,9,0.6,0.016666115520282188
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,0,0.3333333333333333,0.21637345679012346
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,1,0.37777777777777777,0.1557418430335097
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,2,0.3333333333333333,0.21637345679012346
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,3,0.37777777777777777,0.1557418430335097
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,4,0.19999999999999998,0.4843127204585538
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,7,0.37777777777777777,0.1557418430335097
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,8,0.4222222222222222,0.10831349206349207
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,10,9,0.37777777777777777,0.1557418430335097
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,2,0.6,0.016666115520282188
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,5,0.5555555555555555,0.02860945767195767
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,6,0.6,0.016666115520282188
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,9,0.5555555555555555,0.02860945767195767
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,0,0.6,0.016666115520282188
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,2,0.5555555555555555,0.02860945767195767
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,4,0.37777777777777777,0.1557418430335097
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,5,0.37777777777777777,0.1557418430335097
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,9,0.4666666666666666,0.07255015432098766
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,1,0.7640931774583409,0.002263469812035174
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,2,0.3333333333333333,0.21637345679012346
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,3,0.8539864924534399,0.0006436975254696865
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,4,0.5393598899705937,0.03114121059579671
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,5,0.3146266024828463,0.2086677876982641
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,9,0.40451991747794525,0.1059975484249457
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,0,0.6,0.016666115520282188
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,2,0.6,0.016666115520282188
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,4,0.4666666666666666,0.07255015432098766
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,5,0.4666666666666666,0.07255015432098766
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,8,0.6,0.016666115520282188
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,5,0.5555555555555555,0.02860945767195767
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,9,0.6,0.016666115520282188
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,0,0.6,0.016666115520282188
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,4,0.4222222222222222,0.10831349206349207
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,5,0.4666666666666666,0.07255015432098766
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,7,0.6,0.016666115520282188
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,9,0.4222222222222222,0.10831349206349207
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,2,0.6,0.016666115520282188
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,4,0.3333333333333333,0.21637345679012346
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,9,0.37777777777777777,0.1557418430335097
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,3,0.6,0.016666115520282188
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,4,0.24444444444444444,0.38071979717813054
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,9,0.4666666666666666,0.07255015432098766
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,8,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,0,0.4222222222222222,0.10831349206349207
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,1,0.6,0.016666115520282188
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,8,0.6,0.016666115520282188
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,0,0.3333333333333333,0.21637345679012346
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,7,0.6,0.016666115520282188
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,0,0.6741998624632421,0.0070583320485280866
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,2,0.9438798074485389,0.0001621317520439264
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,5,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,8,0.9555555555555554,5.5114638447971785e-06
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,0,0.49441323247304414,0.048193488293190756
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,2,0.8989331499509894,0.0003280163150135276
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,8,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,0,0.31462660248284624,0.2086677876982641
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,2,0.5393598899705937,0.03114121059579671
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,3,0.5393598899705937,0.03114121059579671
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,4,0.37777777777777777,0.1557418430335097
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,6,0.4222222222222222,0.10831349206349207
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,7,0.3333333333333333,0.21637345679012346
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,8,0.13483997249264842,0.590013887163346
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,9,0.6,0.016666115520282188
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.3333333333333333,0.21637345679012346
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.6,0.016666115520282188
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.28888888888888886,0.2912483465608466
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.4222222222222222,0.10831349206349207
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.4666666666666666,0.07255015432098766
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.24444444444444444,0.38071979717813054
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,-0.022222222222222223,1.0
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.37777777777777777,0.1557418430335097
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.4222222222222222,0.10831349206349207
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.4666666666666666,0.07255015432098766
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.9555555555555554,5.5114638447971785e-06
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.6,0.016666115520282188
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.3333333333333333,0.21637345679012346
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.6,0.016666115520282188
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.4222222222222222,0.10831349206349207
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.37777777777777777,0.1557418430335097
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.6,0.016666115520282188
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.24444444444444444,0.38071979717813054
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.4222222222222222,0.10831349206349207
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.15555555555555553,0.6006536596119929
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.28888888888888886,0.2912483465608466
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.6,0.016666115520282188
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.6,0.016666115520282188
aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067
aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829
aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829
aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767
aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,4,0.6,0.016666115520282188
aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552
aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,6,0.4222222222222222,0.10831349206349207
aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,7,0.3333333333333333,0.21637345679012346
aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,8,0.28888888888888886,0.2912483465608466
aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836
aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836
aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,1,0.6,0.016666115520282188
aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829
aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,3,0.4666666666666666,0.07255015432098766
aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552
aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766
aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,7,0.3333333333333333,0.21637345679012346
aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,8,0.4222222222222222,0.10831349206349207
aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067
aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552
aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,1,0.40451991747794525,0.1059975484249457
aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829
aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767
aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,4,0.6292532049656926,0.011921001496914019
aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836
aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,6,0.37777777777777777,0.1557418430335097
aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,7,0.2500645911391736,0.3212772414140661
aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,8,0.37777777777777777,0.1557418430335097
aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,9,0.6,0.016666115520282188
aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552
aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,1,0.49441323247304414,0.048193488293190756
aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829
aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829
aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,4,0.4666666666666666,0.07255015432098766
aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,5,0.6741998624632421,0.0070583320485280866
aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,6,0.3333333333333333,0.21637345679012346
aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,7,0.28888888888888886,0.2912483465608466
aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,8,0.37777777777777777,0.1557418430335097
aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,9,0.6,0.016666115520282188
aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067
aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,1,0.6,0.016666115520282188
aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829
aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,3,0.4666666666666666,0.07255015432098766
aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836
aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067
aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,6,0.5555555555555555,0.02860945767195767
aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,7,0.37777777777777777,0.1557418430335097
aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767
aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067
aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067
aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767
aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829
aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767
aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829
aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,5,0.8090398349558905,0.0012254240706707103
aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,6,0.5393598899705937,0.03114121059579671
aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,7,0.4222222222222222,0.10831349206349207
aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,8,0.28888888888888886,0.2912483465608466
aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067
aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552
aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767
aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,2,0.49441323247304414,0.048193488293190756
aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,3,0.37777777777777777,0.1557418430335097
aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767
aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836
aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,6,0.24444444444444444,0.38071979717813054
aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,7,0.3146266024828463,0.2086677876982641
aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,8,0.5111111111111111,0.04662257495590829
aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,9,0.5555555555555555,0.02860945767195767
aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,0,0.6,0.016666115520282188
aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829
aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,2,0.4666666666666666,0.07255015432098766
aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,3,0.4666666666666666,0.07255015432098766
aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,4,0.4666666666666666,0.07255015432098766
aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836
aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,6,0.3333333333333333,0.21637345679012346
aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,7,0.24444444444444444,0.38071979717813054
aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,8,0.28888888888888886,0.2912483465608466
aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829
aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552
aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829
aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,2,0.5555555555555555,0.02860945767195767
aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,3,0.6,0.016666115520282188
aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,4,0.6,0.016666115520282188
aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552
aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,6,0.4222222222222222,0.10831349206349207
aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,7,0.28888888888888886,0.2912483465608466
aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,8,0.24444444444444444,0.38071979717813054
aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836
aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,0,0.8539864924534399,0.0006436975254696865
aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,1,0.5393598899705937,0.03114121059579671
aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,2,0.6741998624632421,0.0070583320485280866
aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,3,0.5393598899705937,0.03114121059579671
aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,4,0.6292532049656926,0.011921001496914019
aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836
aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,6,0.5843065474681431,0.019550269092885535
aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,7,0.6292532049656926,0.011921001496914019
aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,8,0.7191465199607915,0.004057136032371292
aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,9,0.7191465199607915,0.004057136032371292
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,3,0.6,0.016666115520282188
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,8,0.3333333333333333,0.21637345679012346
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,0,0.7191465199607915,0.004057136032371292
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,1,0.4666666666666666,0.07255015432098766
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,2,0.5520524474738833,0.029345108174841844
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,3,0.6137949055234262,0.01491204965521593
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,4,0.7191465199607915,0.004057136032371292
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,5,0.7191465199607915,0.004057136032371292
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,6,0.6,0.016666115520282188
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,7,0.4494665749754947,0.07248608508684644
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,8,0.28888888888888886,0.2912483465608466
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,9,0.8539864924534399,0.0006436975254696865
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,1,0.7640931774583409,0.002263469812035174
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,2,0.6,0.016666115520282188
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,7,0.6,0.016666115520282188
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,8,0.3333333333333333,0.21637345679012346
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,3,0.6,0.016666115520282188
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,0,0.6,0.016666115520282188
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,1,0.6,0.016666115520282188
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,5,0.6,0.016666115520282188
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,7,0.6,0.016666115520282188
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,0,0.6,0.016666115520282188
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,2,0.5555555555555555,0.02860945767195767
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,3,0.6,0.016666115520282188
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,7,0.6,0.016666115520282188
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,8,0.24444444444444444,0.38071979717813054
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,1,0.4666666666666666,0.07255015432098766
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,2,0.6,0.016666115520282188
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,6,0.5555555555555555,0.02860945767195767
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,8,0.28888888888888886,0.2912483465608466
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755