diff --git "a/metrics.eval.jsonl" "b/metrics.eval.jsonl" --- "a/metrics.eval.jsonl" +++ "b/metrics.eval.jsonl" @@ -158,3 +158,9 @@ {"created_at": "2025-04-27T20:13:04.013175", "global_step": 350000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.37457337883959047, "acc_stderr,none": 0.014144193471893437, "acc_norm,none": 0.4351535836177474, "acc_norm_stderr,none": 0.01448798619718605}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6948653198653199, "acc_stderr,none": 0.009448531094163907, "acc_norm,none": 0.680976430976431, "acc_norm_stderr,none": 0.009564133249441067}, "boolq": {"alias": "boolq", "acc,none": 0.6966360856269113, "acc_stderr,none": 0.008040396817430634}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21375921375921375, "acc_stderr,none": 0.011737086112127208}, "copa": {"alias": "copa", "acc,none": 0.76, "acc_stderr,none": 0.04292346959909282}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4311890061740689, "acc_stderr,none": 0.004942302768002107, "acc_norm,none": 0.5717984465245967, "acc_norm_stderr,none": 0.004938068627349492}, "mmlu": {"acc,none": 0.27253952428429, "acc_stderr,none": 0.0037526792287847868, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2682252922422954, "acc_stderr,none": 0.006461289944670812, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.040061680838488774}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.28484848484848485, "acc_stderr,none": 0.035243908445117836}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.29901960784313725, "acc_stderr,none": 0.03213325717373617}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.28270042194092826, "acc_stderr,none": 0.029312814153955917}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2644628099173554, "acc_stderr,none": 0.04026187527591205}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.0413311944024384}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.31901840490797545, "acc_stderr,none": 0.03661997551073836}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.25722543352601157, "acc_stderr,none": 0.023532925431044287}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24581005586592178, "acc_stderr,none": 0.014400296429225601}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2990353697749196, "acc_stderr,none": 0.026003301117885135}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.024659685185967284}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26988265971316816, "acc_stderr,none": 0.011337381084250408}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03218093795602357}, "mmlu_other": {"acc,none": 0.2642420341165111, "acc_stderr,none": 0.007894057300169368, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2339622641509434, "acc_stderr,none": 0.02605529690115292}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2658959537572254, "acc_stderr,none": 0.033687629322594295}, "mmlu_global_facts": 
{"alias": " - global_facts", "acc,none": 0.4, "acc_stderr,none": 0.049236596391733084}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.18385650224215247, "acc_stderr,none": 0.025998379092356517}, "mmlu_management": {"alias": " - management", "acc,none": 0.30097087378640774, "acc_stderr,none": 0.045416094465039476}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.25213675213675213, "acc_stderr,none": 0.02844796547623102}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621505}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.28735632183908044, "acc_stderr,none": 0.0161824107306827}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.025553169991826503}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.02601199293090202}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2536764705882353, "acc_stderr,none": 0.026431329870789555}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21084337349397592, "acc_stderr,none": 0.031755547866299215}, "mmlu_social_sciences": {"acc,none": 0.2723431914202145, "acc_stderr,none": 0.008026832359453557, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.04185774424022056}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.21717171717171718, "acc_stderr,none": 0.029376616484945633}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.27461139896373055, "acc_stderr,none": 0.03221024508041154}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.258974358974359, "acc_stderr,none": 0.022211106810061682}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.27310924369747897, "acc_stderr,none": 0.028942004040998167}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26422018348623855, "acc_stderr,none": 0.018904164171510196}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.03768335959728745}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.018054027458815198}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3181818181818182, "acc_stderr,none": 0.044612721759105085}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.33877551020408164, "acc_stderr,none": 0.030299506562154185}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.27860696517412936, "acc_stderr,none": 0.031700561834973086}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_stem": {"acc,none": 0.2873453853472883, "acc_stderr,none": 0.008030116483595804, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.34074074074074073, "acc_stderr,none": 0.040943762699967946}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.34210526315789475, "acc_stderr,none": 0.038607315993160925}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 
0.2847222222222222, "acc_stderr,none": 0.03773809990686934}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.42, "acc_stderr,none": 0.04960449637488584}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.17647058823529413, "acc_stderr,none": 0.0379328118530781}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.25957446808510637, "acc_stderr,none": 0.028659179374292323}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.21379310344827587, "acc_stderr,none": 0.034165204477475494}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.022644212615525218}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3193548387096774, "acc_stderr,none": 0.02652270967466777}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3054187192118227, "acc_stderr,none": 0.03240661565868408}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.026962424325073828}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2052980132450331, "acc_stderr,none": 0.03297986648473836}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.03214952147802748}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.20535714285714285, "acc_stderr,none": 0.03834241021419073}, "mmlu_pro": {"exact_match,custom-extract": 0.12342087765957446, "exact_match_stderr,custom-extract": 0.002985033256051767, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.17154811715481172, "exact_match_stderr,custom-extract": 0.014088673719425003}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.12420785804816223, "exact_match_stderr,custom-extract": 0.011749298825998155}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06448763250883392, "exact_match_stderr,custom-extract": 0.007303510883881922}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.13414634146341464, "exact_match_stderr,custom-extract": 0.016851944127279108}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.17061611374407584, "exact_match_stderr,custom-extract": 0.012956092265003955}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10319917440660474, "exact_match_stderr,custom-extract": 0.009777963967387037}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.14058679706601468, "exact_match_stderr,custom-extract": 0.012160802933047539}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.09711286089238845, "exact_match_stderr,custom-extract": 0.015190193611399451}, "mmlu_pro_law": {"alias": " - law", 
"exact_match,custom-extract": 0.12079927338782924, "exact_match_stderr,custom-extract": 0.009826069635820892}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.08660251665433012, "exact_match_stderr,custom-extract": 0.007654701811618176}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12987012987012986, "exact_match_stderr,custom-extract": 0.011064857512116038}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.14428857715430862, "exact_match_stderr,custom-extract": 0.015745808625512853}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.11778290993071594, "exact_match_stderr,custom-extract": 0.008947290267287298}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.18170426065162906, "exact_match_stderr,custom-extract": 0.013658674004058884}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.256, "acc_stderr,none": 0.019536923574747605, "acc_norm,none": 0.398, "acc_norm_stderr,none": 0.02191237788577997}, "piqa": {"alias": "piqa", "acc,none": 0.6964091403699674, "acc_stderr,none": 0.010728079893076364, "acc_norm,none": 0.6964091403699674, "acc_norm_stderr,none": 0.010728079893076357}, "race": {"alias": "race", "acc,none": 0.3521531100478469, "acc_stderr,none": 0.014782629897202254}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43807574206755373, "acc_stderr,none": 0.01122696506802993}, "winogrande": {"alias": "winogrande", "acc,none": 0.6211523283346487, "acc_stderr,none": 0.013633724603180318}} {"created_at": "2025-04-27T21:41:15.808794", "global_step": 352000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3643344709897611, "acc_stderr,none": 0.014063260279882413, "acc_norm,none": 0.4044368600682594, "acc_norm_stderr,none": 0.014342036483436172}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6734006734006734, "acc_stderr,none": 0.00962304703826764, "acc_norm,none": 0.6628787878787878, "acc_norm_stderr,none": 0.009700146509130075}, "boolq": {"alias": "boolq", "acc,none": 0.6880733944954128, "acc_stderr,none": 0.008102818891778078}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.22276822276822278, "acc_stderr,none": 0.011913022964039548}, "copa": {"alias": "copa", "acc,none": 0.75, "acc_stderr,none": 0.04351941398892446}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.433877713602868, "acc_stderr,none": 0.004945956744943811, "acc_norm,none": 0.5787691694881498, "acc_norm_stderr,none": 0.004927473370720143}, "mmlu": {"acc,none": 0.2817974647486113, "acc_stderr,none": 0.003790018675476917, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2639744952178533, "acc_stderr,none": 0.006426340611557341, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3253968253968254, "acc_stderr,none": 0.041905964388711366}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3090909090909091, "acc_stderr,none": 0.036085410115739666}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.03132179803083293}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.21518987341772153, "acc_stderr,none": 0.026750826994676173}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2231404958677686, "acc_stderr,none": 0.03800754475228733}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2037037037037037, "acc_stderr,none": 
0.038935425188248475}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26380368098159507, "acc_stderr,none": 0.03462419931615624}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.25722543352601157, "acc_stderr,none": 0.023532925431044276}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.25251396648044694, "acc_stderr,none": 0.014530330201468662}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2733118971061093, "acc_stderr,none": 0.025311765975426122}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.27469135802469136, "acc_stderr,none": 0.024836057868294677}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27444589308996087, "acc_stderr,none": 0.011397043163078154}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23976608187134502, "acc_stderr,none": 0.03274485211946956}, "mmlu_other": {"acc,none": 0.28741551335693594, "acc_stderr,none": 0.008085743916042285, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.30566037735849055, "acc_stderr,none": 0.028353298073322666}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3583815028901734, "acc_stderr,none": 0.0365634365335316}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.18834080717488788, "acc_stderr,none": 0.02624113299640727}, "mmlu_management": {"alias": " - management", "acc,none": 0.2621359223300971, "acc_stderr,none": 0.04354631077260595}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.32051282051282054, "acc_stderr,none": 0.030572811310299607}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2796934865900383, "acc_stderr,none": 0.016050792148036525}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.30392156862745096, "acc_stderr,none": 0.026336613469046644}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24822695035460993, "acc_stderr,none": 0.025770015644290403}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3897058823529412, "acc_stderr,none": 0.029624663581159696}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.24096385542168675, "acc_stderr,none": 0.0332939411907353}, "mmlu_social_sciences": {"acc,none": 0.30679233019174523, "acc_stderr,none": 0.008312085053658384, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.04142439719489361}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.29292929292929293, "acc_stderr,none": 0.03242497958178817}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.34196891191709844, "acc_stderr,none": 0.03423465100104283}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2923076923076923, "acc_stderr,none": 0.023060438380857733}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3445378151260504, "acc_stderr,none": 0.03086868260412163}, "mmlu_high_school_psychology": 
{"alias": " - high_school_psychology", "acc,none": 0.326605504587156, "acc_stderr,none": 0.020106990889937306}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.26717557251908397, "acc_stderr,none": 0.03880848301082397}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.01824902441120766}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.04069306319721377}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3551020408163265, "acc_stderr,none": 0.030635655150387634}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.31840796019900497, "acc_stderr,none": 0.03294118479054096}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.3, "acc_stderr,none": 0.04605661864718381}, "mmlu_stem": {"acc,none": 0.278464954012052, "acc_stderr,none": 0.007978595346692968, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3092105263157895, "acc_stderr,none": 0.037610708698674805}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.03852084696008534}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.30392156862745096, "acc_stderr,none": 0.04576665403207762}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720683}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.251063829787234, "acc_stderr,none": 0.02834696377716245}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.036951833116502325}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.022569897074918417}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3, "acc_stderr,none": 0.026069362295335144}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.270935960591133, "acc_stderr,none": 0.03127090713297698}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.21851851851851853, "acc_stderr,none": 0.02519575225182379}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31125827814569534, "acc_stderr,none": 0.03780445850526732}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.36574074074074076, "acc_stderr,none": 0.03284738857647207}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04287858751340456}, "mmlu_pro": {"exact_match,custom-extract": 0.1254155585106383, 
"exact_match_stderr,custom-extract": 0.0030021632590520922, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.18688981868898186, "exact_match_stderr,custom-extract": 0.014568371570721193}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10646387832699619, "exact_match_stderr,custom-extract": 0.010987378600044825}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07332155477031801, "exact_match_stderr,custom-extract": 0.007750845159494202}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.14634146341463414, "exact_match_stderr,custom-extract": 0.017476889350508576}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.19075829383886256, "exact_match_stderr,custom-extract": 0.013532157875306485}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09391124871001032, "exact_match_stderr,custom-extract": 0.009375760359013255}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.14547677261613692, "exact_match_stderr,custom-extract": 0.012335243774582429}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11286089238845144, "exact_match_stderr,custom-extract": 0.016232140903461433}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10535876475930972, "exact_match_stderr,custom-extract": 0.009256854730301894}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0917838638045892, "exact_match_stderr,custom-extract": 0.007857979485361452}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.14502164502164502, "exact_match_stderr,custom-extract": 0.011590258522971733}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1462925851703407, "exact_match_stderr,custom-extract": 0.015836201263905444}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1100846805234796, "exact_match_stderr,custom-extract": 0.008687612438998108}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.18045112781954886, "exact_match_stderr,custom-extract": 0.013621911931802996}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.258, "acc_stderr,none": 0.019586711785215837, "acc_norm,none": 0.372, "acc_norm_stderr,none": 0.0216371979857224}, "piqa": {"alias": "piqa", "acc,none": 0.6920565832426551, "acc_stderr,none": 0.010770892367463682, "acc_norm,none": 0.7018498367791077, "acc_norm_stderr,none": 0.010672964114008301}, "race": {"alias": "race", "acc,none": 0.3617224880382775, "acc_stderr,none": 0.014871072026717747}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43398157625383826, "acc_stderr,none": 0.011215013703683814}, "winogrande": {"alias": "winogrande", "acc,none": 0.6124704025256511, "acc_stderr,none": 0.01369235463601677}} {"created_at": "2025-04-27T23:37:42.849179", "global_step": 354000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.35580204778157, "acc_stderr,none": 0.01399057113791876, "acc_norm,none": 0.4052901023890785, "acc_norm_stderr,none": 0.01434686906022933}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6910774410774411, "acc_stderr,none": 0.00948104838776135, "acc_norm,none": 0.6822390572390572, "acc_norm_stderr,none": 0.009554033064443062}, "boolq": {"alias": "boolq", "acc,none": 0.7345565749235474, "acc_stderr,none": 0.0077230909835904705}, "commonsense_qa": {"alias": "commonsense_qa", 
"acc,none": 0.2153972153972154, "acc_stderr,none": 0.011769690686226967}, "copa": {"alias": "copa", "acc,none": 0.75, "acc_stderr,none": 0.04351941398892446}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4342760406293567, "acc_stderr,none": 0.004946485466544621, "acc_norm,none": 0.5718980282812188, "acc_norm_stderr,none": 0.00493792432674258}, "mmlu": {"acc,none": 0.2839339125480701, "acc_stderr,none": 0.003790112482847812, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25759829968119025, "acc_stderr,none": 0.006372354532171793, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30158730158730157, "acc_stderr,none": 0.04104947269903394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3393939393939394, "acc_stderr,none": 0.03697442205031596}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3137254901960784, "acc_stderr,none": 0.03256685484460388}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.24472573839662448, "acc_stderr,none": 0.027985699387036416}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2892561983471074, "acc_stderr,none": 0.04139112727635461}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.03957835471980979}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.033519538795212696}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.22254335260115607, "acc_stderr,none": 0.02239421566194282}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2829581993569132, "acc_stderr,none": 0.025583062489984827}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24382716049382716, "acc_stderr,none": 0.023891879541959607}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25749674054758803, "acc_stderr,none": 0.011167706014904156}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.032180937956023566}, "mmlu_other": {"acc,none": 0.29127775989700677, "acc_stderr,none": 0.008143078852547598, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3132075471698113, "acc_stderr,none": 0.028544793319055326}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.32947976878612717, "acc_stderr,none": 0.03583901754736411}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.26905829596412556, "acc_stderr,none": 0.029763779406874972}, "mmlu_management": {"alias": " - management", "acc,none": 0.36893203883495146, "acc_stderr,none": 0.047776151811567386}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.25213675213675213, "acc_stderr,none": 0.02844796547623102}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2707535121328225, "acc_stderr,none": 0.015889888362560486}, "mmlu_nutrition": {"alias": " - 
nutrition", "acc,none": 0.2973856209150327, "acc_stderr,none": 0.026173908506718576}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24468085106382978, "acc_stderr,none": 0.025645553622266733}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3639705882352941, "acc_stderr,none": 0.02922719246003203}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.303542411439714, "acc_stderr,none": 0.008261698180047591, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281337}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.03191178226713547}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.31088082901554404, "acc_stderr,none": 0.03340361906276587}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.33076923076923076, "acc_stderr,none": 0.02385479568097112}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.35294117647058826, "acc_stderr,none": 0.031041941304059288}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.29541284403669726, "acc_stderr,none": 0.019560619182976}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.26717557251908397, "acc_stderr,none": 0.03880848301082396}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.26633986928104575, "acc_stderr,none": 0.01788318813466719}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2, "acc_stderr,none": 0.03831305140884601}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.40816326530612246, "acc_stderr,none": 0.03146465712827423}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.34328358208955223, "acc_stderr,none": 0.03357379665433431}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_stem": {"acc,none": 0.29686013320647003, "acc_stderr,none": 0.00806892615616774, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.03547854198560824}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.32894736842105265, "acc_stderr,none": 0.038234289699266046}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3125, "acc_stderr,none": 0.038760854559127644}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.04655010411319615}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 
0.20851063829787234, "acc_stderr,none": 0.026556982117838728}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3103448275862069, "acc_stderr,none": 0.03855289616378948}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.022569897074918424}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3419354838709677, "acc_stderr,none": 0.02698528957655273}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.32019704433497537, "acc_stderr,none": 0.0328264938530415}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.026842057873833706}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.3841059602649007, "acc_stderr,none": 0.03971301814719198}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.4444444444444444, "acc_stderr,none": 0.03388857118502325}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.15178571428571427, "acc_stderr,none": 0.03405702838185691}, "mmlu_pro": {"exact_match,custom-extract": 0.12400265957446809, "exact_match_stderr,custom-extract": 0.0029813654329394123, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.199442119944212, "exact_match_stderr,custom-extract": 0.014933042396591396}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10646387832699619, "exact_match_stderr,custom-extract": 0.010987378600044838}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06713780918727916, "exact_match_stderr,custom-extract": 0.0074415092498656375}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.15365853658536585, "exact_match_stderr,custom-extract": 0.01783156665820722}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1966824644549763, "exact_match_stderr,custom-extract": 0.013690290289032438}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09907120743034056, "exact_match_stderr,custom-extract": 0.009602432935115167}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.17603911980440098, "exact_match_stderr,custom-extract": 0.013324375473773221}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11811023622047244, "exact_match_stderr,custom-extract": 0.01655614119804242}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10626702997275204, "exact_match_stderr,custom-extract": 0.009291949023141264}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.10288675055514433, "exact_match_stderr,custom-extract": 0.00826868555613196}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1277056277056277, "exact_match_stderr,custom-extract": 0.010985901551102233}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13226452905811623, "exact_match_stderr,custom-extract": 0.015181011139551245}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07467282525019246, "exact_match_stderr,custom-extract": 0.007296113874717256}, "mmlu_pro_psychology": {"alias": " - 
psychology", "exact_match,custom-extract": 0.17293233082706766, "exact_match_stderr,custom-extract": 0.013396133254614466}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.252, "acc_stderr,none": 0.01943572728224952, "acc_norm,none": 0.382, "acc_norm_stderr,none": 0.021750820591250834}, "piqa": {"alias": "piqa", "acc,none": 0.6871599564744287, "acc_stderr,none": 0.0108177144257011, "acc_norm,none": 0.6985854189336235, "acc_norm_stderr,none": 0.010706248242753761}, "race": {"alias": "race", "acc,none": 0.3799043062200957, "acc_stderr,none": 0.01502160080493565}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4390992835209826, "acc_stderr,none": 0.011229831502847804}, "winogrande": {"alias": "winogrande", "acc,none": 0.6227308602999211, "acc_stderr,none": 0.013622567928799501}} +{"created_at": "2025-04-28T01:34:32.186262", "global_step": 356000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3464163822525597, "acc_stderr,none": 0.013905011180063239, "acc_norm,none": 0.3924914675767918, "acc_norm_stderr,none": 0.014269634635670709}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6721380471380471, "acc_stderr,none": 0.009632587076170011, "acc_norm,none": 0.6430976430976431, "acc_norm_stderr,none": 0.009830630210347016}, "boolq": {"alias": "boolq", "acc,none": 0.6932721712538227, "acc_stderr,none": 0.008065309051771783}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21785421785421785, "acc_stderr,none": 0.011818079981132525}, "copa": {"alias": "copa", "acc,none": 0.71, "acc_stderr,none": 0.04560480215720684}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4355706034654451, "acc_stderr,none": 0.00494818136702494, "acc_norm,none": 0.577275443138817, "acc_norm_stderr,none": 0.004929828337606974}, "mmlu": {"acc,none": 0.259792052414186, "acc_stderr,none": 0.0036965480296688247, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25377258235919237, "acc_stderr,none": 0.006340367871161165, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.0361960452412425}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.03546563019624337}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.03198001660115069}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.31645569620253167, "acc_stderr,none": 0.03027497488021897}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.03984979653302872}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094633}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.24539877300613497, "acc_stderr,none": 0.03380939813943354}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2630057803468208, "acc_stderr,none": 0.023703099525258172}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3054662379421222, "acc_stderr,none": 0.026160584450140478}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023132376234543332}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24445893089960888, "acc_stderr,none": 
0.01097642501311389}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.03126781714663179}, "mmlu_other": {"acc,none": 0.28934663662697135, "acc_stderr,none": 0.0081247632916482, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2943396226415094, "acc_stderr,none": 0.028049186315695248}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2832369942196532, "acc_stderr,none": 0.03435568056047873}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.37668161434977576, "acc_stderr,none": 0.032521134899291884}, "mmlu_management": {"alias": " - management", "acc,none": 0.2912621359223301, "acc_stderr,none": 0.044986763205729224}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3418803418803419, "acc_stderr,none": 0.03107502852650776}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2835249042145594, "acc_stderr,none": 0.016117318166832272}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.238562091503268, "acc_stderr,none": 0.024404394928087866}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.02699219917306436}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.23897058823529413, "acc_stderr,none": 0.025905280644893006}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3192771084337349, "acc_stderr,none": 0.03629335329947861}, "mmlu_social_sciences": {"acc,none": 0.2577185570360741, "acc_stderr,none": 0.007895639275638942, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.041857744240220575}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.23232323232323232, "acc_stderr,none": 0.030088629490217483}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2538860103626943, "acc_stderr,none": 0.03141024780565318}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2564102564102564, "acc_stderr,none": 0.022139081103971538}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.28991596638655465, "acc_stderr,none": 0.029472485833136084}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25871559633027524, "acc_stderr,none": 0.01877605231961962}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596919}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.017952449196987866}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.04172343038705383}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.23673469387755103, "acc_stderr,none": 0.02721283588407315}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2537313432835821, "acc_stderr,none": 0.03076944496729601}, 
"mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_stem": {"acc,none": 0.241674595623216, "acc_stderr,none": 0.007617733906661431, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.03712537833614865}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19736842105263158, "acc_stderr,none": 0.03238981601699397}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.22916666666666666, "acc_stderr,none": 0.035146974678623884}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.15, "acc_stderr,none": 0.03588702812826371}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.04158307533083286}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3148936170212766, "acc_stderr,none": 0.030363582197238174}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131184}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.0220190800122179}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.267741935483871, "acc_stderr,none": 0.02518900666021238}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.24630541871921183, "acc_stderr,none": 0.030315099285617708}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.024556172219141265}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2251655629139073, "acc_stderr,none": 0.03410435282008936}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.19444444444444445, "acc_stderr,none": 0.026991454502036712}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.12691156914893617, "exact_match_stderr,custom-extract": 0.003022744785205824, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.16736401673640167, "exact_match_stderr,custom-extract": 0.013950896661189977}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11913814955640051, "exact_match_stderr,custom-extract": 0.011540276571470737}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07685512367491167, "exact_match_stderr,custom-extract": 0.007920271010098666}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.11951219512195121, "exact_match_stderr,custom-extract": 0.016040065235546762}, "mmlu_pro_economics": {"alias": " - 
economics", "exact_match,custom-extract": 0.1670616113744076, "exact_match_stderr,custom-extract": 0.012847865601311619}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09494324045407637, "exact_match_stderr,custom-extract": 0.00942176471567824}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.18092909535452323, "exact_match_stderr,custom-extract": 0.01346802541043788}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10761154855643044, "exact_match_stderr,custom-extract": 0.015896979452723368}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.12352406902815623, "exact_match_stderr,custom-extract": 0.009920862929791524}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.09548482605477424, "exact_match_stderr,custom-extract": 0.007998494027144747}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1396103896103896, "exact_match_stderr,custom-extract": 0.011407897167896806}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13827655310621242, "exact_match_stderr,custom-extract": 0.015468334539576873}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.12086220169361046, "exact_match_stderr,custom-extract": 0.009047662268420943}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.16917293233082706, "exact_match_stderr,custom-extract": 0.013279801895764033}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.254, "acc_stderr,none": 0.01948659680164338, "acc_norm,none": 0.38, "acc_norm_stderr,none": 0.02172888143870171}, "piqa": {"alias": "piqa", "acc,none": 0.690424374319913, "acc_stderr,none": 0.010786656752183345, "acc_norm,none": 0.70620239390642, "acc_norm_stderr,none": 0.010627574080514797}, "race": {"alias": "race", "acc,none": 0.369377990430622, "acc_stderr,none": 0.014937221457864277}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43091095189355166, "acc_stderr,none": 0.01120553917756672}, "winogrande": {"alias": "winogrande", "acc,none": 0.6069455406471981, "acc_stderr,none": 0.01372727624910844}} +{"created_at": "2025-04-28T03:49:46.252305", "global_step": 358000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3378839590443686, "acc_stderr,none": 0.01382204792228351, "acc_norm,none": 0.40187713310580203, "acc_norm_stderr,none": 0.014327268614578276}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6742424242424242, "acc_stderr,none": 0.009616642976885963, "acc_norm,none": 0.6574074074074074, "acc_norm_stderr,none": 0.009738105469984187}, "boolq": {"alias": "boolq", "acc,none": 0.6483180428134556, "acc_stderr,none": 0.008351445237661383}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2203112203112203, "acc_stderr,none": 0.01186585494340244}, "copa": {"alias": "copa", "acc,none": 0.67, "acc_stderr,none": 0.04725815626252607}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.43238398725353516, "acc_stderr,none": 0.004943945069611446, "acc_norm,none": 0.5764787890858395, "acc_norm_stderr,none": 0.004931065434173696}, "mmlu": {"acc,none": 0.2766699900299103, "acc_stderr,none": 0.0037667272286089916, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2786397449521785, "acc_stderr,none": 0.006526378345632879, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04006168083848878}, "mmlu_high_school_european_history": {"alias": " - 
high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.03427743175816524}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.36764705882352944, "acc_stderr,none": 0.03384132045674118}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3670886075949367, "acc_stderr,none": 0.03137624072561619}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.30578512396694213, "acc_stderr,none": 0.04205953933884123}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3148148148148148, "acc_stderr,none": 0.04489931073591311}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3128834355828221, "acc_stderr,none": 0.036429145782924055}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2774566473988439, "acc_stderr,none": 0.024105712607754307}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24134078212290502, "acc_stderr,none": 0.014310999547961447}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.26366559485530544, "acc_stderr,none": 0.025025538500532338}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02438366553103545}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27509778357235987, "acc_stderr,none": 0.011405443620996932}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.28654970760233917, "acc_stderr,none": 0.03467826685703826}, "mmlu_other": {"acc,none": 0.3009333762471838, "acc_stderr,none": 0.008200055670980819, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.4, "acc_stderr,none": 0.04923659639173309}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3018867924528302, "acc_stderr,none": 0.028254200344438655}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.30057803468208094, "acc_stderr,none": 0.03496101481191181}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.37, "acc_stderr,none": 0.048523658709390974}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.34977578475336324, "acc_stderr,none": 0.03200736719484503}, "mmlu_management": {"alias": " - management", "acc,none": 0.24271844660194175, "acc_stderr,none": 0.04245022486384495}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3547008547008547, "acc_stderr,none": 0.03134250486245402}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.3243933588761175, "acc_stderr,none": 0.016740929047162696}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.02526169121972948}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.026684564340461008}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1948529411764706, "acc_stderr,none": 0.024060599423487414}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2710843373493976, "acc_stderr,none": 0.03460579907553026}, "mmlu_social_sciences": {"acc,none": 0.2720181995450114, "acc_stderr,none": 0.008019978886083749, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281335}, 
"mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2474747474747475, "acc_stderr,none": 0.0307463007421245}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.26424870466321243, "acc_stderr,none": 0.031821550509166484}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2717948717948718, "acc_stderr,none": 0.022556551010132368}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.226890756302521, "acc_stderr,none": 0.027205371538279483}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26972477064220185, "acc_stderr,none": 0.01902848671111545}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.03768335959728744}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.29901960784313725, "acc_stderr,none": 0.01852175621542302}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.04350271442923243}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.23265306122448978, "acc_stderr,none": 0.02704925791589618}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.32338308457711445, "acc_stderr,none": 0.03307615947979035}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_stem": {"acc,none": 0.25436092610212496, "acc_stderr,none": 0.007746447967555467, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04072314811876837}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.035834961763610625}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2916666666666667, "acc_stderr,none": 0.03800968060554858}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.04220773659171452}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2723404255319149, "acc_stderr,none": 0.0291012906983867}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.21379310344827587, "acc_stderr,none": 0.03416520447747548}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.022860838309232072}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.27741935483870966, "acc_stderr,none": 0.02547019683590005}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1921182266009852, "acc_stderr,none": 0.02771931570961478}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.31, 
"acc_stderr,none": 0.04648231987117316}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.025928876132766097}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.25165562913907286, "acc_stderr,none": 0.03543304234389985}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.19907407407407407, "acc_stderr,none": 0.02723229846269024}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.1260804521276596, "exact_match_stderr,custom-extract": 0.003003378717388802, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.20920502092050208, "exact_match_stderr,custom-extract": 0.015200626647538911}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10899873257287707, "exact_match_stderr,custom-extract": 0.011101630697795408}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0715547703180212, "exact_match_stderr,custom-extract": 0.007664187803003869}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.15365853658536585, "exact_match_stderr,custom-extract": 0.01783156665820722}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1848341232227488, "exact_match_stderr,custom-extract": 0.013369041898186952}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09597523219814241, "exact_match_stderr,custom-extract": 0.009467429322574824}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1821515892420538, "exact_match_stderr,custom-extract": 0.01350336046749538}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12598425196850394, "exact_match_stderr,custom-extract": 0.01702260263856952}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09627611262488647, "exact_match_stderr,custom-extract": 0.008893665915732372}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.08660251665433012, "exact_match_stderr,custom-extract": 0.007654701811618188}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.13203463203463203, "exact_match_stderr,custom-extract": 0.011142798517705924}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.15831663326653306, "exact_match_stderr,custom-extract": 0.016357727678825804}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10007698229407236, "exact_match_stderr,custom-extract": 0.008329758962147563}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.17167919799498746, "exact_match_stderr,custom-extract": 0.013357616212456415}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.27, "acc_stderr,none": 0.01987435483128749, "acc_norm,none": 0.378, "acc_norm_stderr,none": 0.021706550824518184}, "piqa": {"alias": "piqa", "acc,none": 0.6936887921653971, "acc_stderr,none": 0.010754970032367318, "acc_norm,none": 0.70620239390642, "acc_norm_stderr,none": 0.010627574080514799}, "race": {"alias": "race", "acc,none": 0.3645933014354067, "acc_stderr,none": 0.014896354113839586}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43244626407369496, "acc_stderr,none": 0.011210331273967561}, "winogrande": {"alias": "winogrande", "acc,none": 
0.6124704025256511, "acc_stderr,none": 0.01369235463601677}} +{"created_at": "2025-04-28T05:31:41.477272", "global_step": 360000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3438566552901024, "acc_stderr,none": 0.013880644570156211, "acc_norm,none": 0.3967576791808874, "acc_norm_stderr,none": 0.014296513020180628}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6675084175084175, "acc_stderr,none": 0.009666892606130127, "acc_norm,none": 0.6485690235690236, "acc_norm_stderr,none": 0.00979639558281772}, "boolq": {"alias": "boolq", "acc,none": 0.7290519877675841, "acc_stderr,none": 0.007773467255881215}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.24488124488124488, "acc_stderr,none": 0.012311344255040359}, "copa": {"alias": "copa", "acc,none": 0.68, "acc_stderr,none": 0.04688261722621504}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4362676757618004, "acc_stderr,none": 0.004949080334816036, "acc_norm,none": 0.5804620593507269, "acc_norm_stderr,none": 0.004924748500639343}, "mmlu": {"acc,none": 0.26378008830650906, "acc_stderr,none": 0.003709119387579032, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2678002125398512, "acc_stderr,none": 0.006436422139490316, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.038522733649243156}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.03546563019624336}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3137254901960784, "acc_stderr,none": 0.03256685484460389}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.4092827004219409, "acc_stderr,none": 0.032007041833595914}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.0398497965330287}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.32407407407407407, "acc_stderr,none": 0.04524596007030048}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25766871165644173, "acc_stderr,none": 0.03436150827846917}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.28034682080924855, "acc_stderr,none": 0.02418242749657761}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2446927374301676, "acc_stderr,none": 0.014378169884098417}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19614147909967847, "acc_stderr,none": 0.022552447780478022}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24382716049382716, "acc_stderr,none": 0.023891879541959607}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2666232073011734, "acc_stderr,none": 0.011293836031612135}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.27485380116959063, "acc_stderr,none": 0.03424042924691584}, "mmlu_other": {"acc,none": 0.28644995172191823, "acc_stderr,none": 0.008097203373421524, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.38, "acc_stderr,none": 0.048783173121456316}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3018867924528302, "acc_stderr,none": 0.02825420034443866}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.27167630057803466, "acc_stderr,none": 0.03391750322321659}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 
0.04725815626252605}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3632286995515695, "acc_stderr,none": 0.032277904428505}, "mmlu_management": {"alias": " - management", "acc,none": 0.24271844660194175, "acc_stderr,none": 0.04245022486384495}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.29914529914529914, "acc_stderr,none": 0.02999695185834948}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2771392081736909, "acc_stderr,none": 0.016005636294122428}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22875816993464052, "acc_stderr,none": 0.024051029739912255}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.026992199173064356}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.025767252010855966}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3253012048192771, "acc_stderr,none": 0.03647168523683227}, "mmlu_social_sciences": {"acc,none": 0.24764380890477738, "acc_stderr,none": 0.007784042500719237, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.0298575156733864}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.24352331606217617, "acc_stderr,none": 0.03097543638684542}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2230769230769231, "acc_stderr,none": 0.021107730127244}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23949579831932774, "acc_stderr,none": 0.02772206549336126}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24220183486238533, "acc_stderr,none": 0.01836817630659862}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2900763358778626, "acc_stderr,none": 0.03980066246467765}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2679738562091503, "acc_stderr,none": 0.017917974069594726}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04265792110940589}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20408163265306123, "acc_stderr,none": 0.025801283475090506}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2835820895522388, "acc_stderr,none": 0.03187187537919797}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720683}, "mmlu_stem": {"acc,none": 0.25118934348239774, "acc_stderr,none": 0.0076969623752900646, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3111111111111111, "acc_stderr,none": 0.03999262876617722}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.20394736842105263, "acc_stderr,none": 0.03279000406310051}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2916666666666667, "acc_stderr,none": 0.03800968060554858}, "mmlu_college_chemistry": {"alias": 
" - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653695}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768081}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342347}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.023068188848261124}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24516129032258063, "acc_stderr,none": 0.024472243840895525}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.20689655172413793, "acc_stderr,none": 0.02850137816789395}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.02438843043398766}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2185430463576159, "acc_stderr,none": 0.03374235550425694}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.17592592592592593, "acc_stderr,none": 0.025967420958258533}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.38392857142857145, "acc_stderr,none": 0.04616143075028547}, "mmlu_pro": {"exact_match,custom-extract": 0.14353390957446807, "exact_match_stderr,custom-extract": 0.0031710513446770398, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.2384937238493724, "exact_match_stderr,custom-extract": 0.01592643999671732}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11913814955640051, "exact_match_stderr,custom-extract": 0.011540276571470737}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07243816254416961, "exact_match_stderr,custom-extract": 0.007707683029020952}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.16341463414634147, "exact_match_stderr,custom-extract": 0.018282641806528298}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.19549763033175355, "exact_match_stderr,custom-extract": 0.013659054104691959}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10732714138286893, "exact_match_stderr,custom-extract": 0.009948629733788088}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.19926650366748166, "exact_match_stderr,custom-extract": 0.013974945415562682}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.13648293963254593, "exact_match_stderr,custom-extract": 0.017610952544682614}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11716621253405994, "exact_match_stderr,custom-extract": 0.009697154745523068}, 
"mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.11621021465581051, "exact_match_stderr,custom-extract": 0.00872227462116445}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.14935064935064934, "exact_match_stderr,custom-extract": 0.011732160468660276}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13627254509018036, "exact_match_stderr,custom-extract": 0.015373681322287381}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.12779060816012316, "exact_match_stderr,custom-extract": 0.009266644485474497}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.21428571428571427, "exact_match_stderr,custom-extract": 0.014534489201025444}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.27, "acc_stderr,none": 0.01987435483128749, "acc_norm,none": 0.388, "acc_norm_stderr,none": 0.021814300984787635}, "piqa": {"alias": "piqa", "acc,none": 0.6936887921653971, "acc_stderr,none": 0.010754970032367321, "acc_norm,none": 0.6980413492927094, "acc_norm_stderr,none": 0.010711732891588346}, "race": {"alias": "race", "acc,none": 0.3617224880382775, "acc_stderr,none": 0.014871072026717747}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.42067553735926305, "acc_stderr,none": 0.011170778517705614}, "winogrande": {"alias": "winogrande", "acc,none": 0.6393054459352802, "acc_stderr,none": 0.013496064394234033}} +{"created_at": "2025-04-28T07:08:18.738056", "global_step": 362000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.36006825938566556, "acc_stderr,none": 0.014027516814585186, "acc_norm,none": 0.4129692832764505, "acc_norm_stderr,none": 0.014388344935398326}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6898148148148148, "acc_stderr,none": 0.009491721291998515, "acc_norm,none": 0.6835016835016835, "acc_norm_stderr,none": 0.009543851857323891}, "boolq": {"alias": "boolq", "acc,none": 0.7382262996941896, "acc_stderr,none": 0.007688653730439844}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2285012285012285, "acc_stderr,none": 0.012020761312005539}, "copa": {"alias": "copa", "acc,none": 0.69, "acc_stderr,none": 0.04648231987117316}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.43845847440748853, "acc_stderr,none": 0.004951840978219681, "acc_norm,none": 0.5839474208325035, "acc_norm_stderr,none": 0.004918951019183904}, "mmlu": {"acc,none": 0.27182737501780374, "acc_stderr,none": 0.0037484587676164887, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2682252922422954, "acc_stderr,none": 0.006454725249176063, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.038095238095238106}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3151515151515151, "acc_stderr,none": 0.0362773057502241}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.031980016601150726}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.23140495867768596, "acc_stderr,none": 0.03849856098794088}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3312883435582822, 
"acc_stderr,none": 0.03697983910025588}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.25722543352601157, "acc_stderr,none": 0.023532925431044287}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24804469273743016, "acc_stderr,none": 0.01444415780826146}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2958199356913183, "acc_stderr,none": 0.025922371788818777}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.31790123456790126, "acc_stderr,none": 0.025910063528240865}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2653194263363755, "acc_stderr,none": 0.011276198843958873}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.19883040935672514, "acc_stderr,none": 0.030611116557432528}, "mmlu_other": {"acc,none": 0.26295461860315417, "acc_stderr,none": 0.007886749202197592, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22264150943396227, "acc_stderr,none": 0.025604233470899105}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.28901734104046245, "acc_stderr,none": 0.03456425745087001}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.21076233183856502, "acc_stderr,none": 0.027373095500540193}, "mmlu_management": {"alias": " - management", "acc,none": 0.27184466019417475, "acc_stderr,none": 0.044052680241409216}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.25213675213675213, "acc_stderr,none": 0.02844796547623101}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165065}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.28607918263090676, "acc_stderr,none": 0.01616087140512753}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.30718954248366015, "acc_stderr,none": 0.026415601914389}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.026992199173064356}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.21323529411764705, "acc_stderr,none": 0.024880971512294285}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.24096385542168675, "acc_stderr,none": 0.03329394119073531}, "mmlu_social_sciences": {"acc,none": 0.2853428664283393, "acc_stderr,none": 0.008110999532419168, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.19298245614035087, "acc_stderr,none": 0.03712454853721368}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.29292929292929293, "acc_stderr,none": 0.03242497958178815}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2849740932642487, "acc_stderr,none": 0.03257714077709659}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2846153846153846, "acc_stderr,none": 0.022878322799706297}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2815126050420168, "acc_stderr,none": 0.02921354941437216}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.27706422018348625, "acc_stderr,none": 0.01918848259016953}, 
"mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306086}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27124183006535946, "acc_stderr,none": 0.01798661530403031}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.19090909090909092, "acc_stderr,none": 0.03764425585984926}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.4, "acc_stderr,none": 0.03136250240935893}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.36318407960199006, "acc_stderr,none": 0.034005985055990146}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_stem": {"acc,none": 0.272756105296543, "acc_stderr,none": 0.007928359646207219, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3037037037037037, "acc_stderr,none": 0.03972552884785136}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3092105263157895, "acc_stderr,none": 0.037610708698674805}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.037455547914624555}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.19148936170212766, "acc_stderr,none": 0.025722149992637805}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3103448275862069, "acc_stderr,none": 0.03855289616378947}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.022644212615525218}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3161290322580645, "acc_stderr,none": 0.02645087448904277}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.26108374384236455, "acc_stderr,none": 0.030903796952114485}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2847682119205298, "acc_stderr,none": 0.03684881521389023}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02988691054762696}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755805}, "mmlu_pro": {"exact_match,custom-extract": 0.13771609042553193, "exact_match_stderr,custom-extract": 0.0031277699073629698, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - 
biology", "exact_match,custom-extract": 0.2217573221757322, "exact_match_stderr,custom-extract": 0.015525299781071858}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10139416983523447, "exact_match_stderr,custom-extract": 0.010752959229023352}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.09628975265017668, "exact_match_stderr,custom-extract": 0.008771489271271223}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.14878048780487804, "exact_match_stderr,custom-extract": 0.017596736073033845}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.17772511848341233, "exact_match_stderr,custom-extract": 0.013166463235967815}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10319917440660474, "exact_match_stderr,custom-extract": 0.009777963967387046}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.16136919315403422, "exact_match_stderr,custom-extract": 0.01287018209245067}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.13123359580052493, "exact_match_stderr,custom-extract": 0.01732136945584154}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11989100817438691, "exact_match_stderr,custom-extract": 0.009794114853194124}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.12213175425610659, "exact_match_stderr,custom-extract": 0.008911731297001183}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.13636363636363635, "exact_match_stderr,custom-extract": 0.011295719428226613}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.15030060120240482, "exact_match_stderr,custom-extract": 0.01601394538357726}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.12779060816012316, "exact_match_stderr,custom-extract": 0.009266644485474505}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.19047619047619047, "exact_match_stderr,custom-extract": 0.01390932327432391}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.266, "acc_stderr,none": 0.019780559675655486, "acc_norm,none": 0.376, "acc_norm_stderr,none": 0.02168382753928612}, "piqa": {"alias": "piqa", "acc,none": 0.7007616974972797, "acc_stderr,none": 0.010684130673134581, "acc_norm,none": 0.7040261153427638, "acc_norm_stderr,none": 0.010650414317148126}, "race": {"alias": "race", "acc,none": 0.3588516746411483, "acc_stderr,none": 0.014845215125262313}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43807574206755373, "acc_stderr,none": 0.01122696506802993}, "winogrande": {"alias": "winogrande", "acc,none": 0.6235201262825573, "acc_stderr,none": 0.013616931960667185}} +{"created_at": "2025-04-28T09:09:01.054124", "global_step": 364000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.36177474402730375, "acc_stderr,none": 0.014041957945038071, "acc_norm,none": 0.40784982935153585, "acc_norm_stderr,none": 0.014361097288449696}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6763468013468014, "acc_stderr,none": 0.00960047818227378, "acc_norm,none": 0.6544612794612794, "acc_norm_stderr,none": 0.0097579487306703}, "boolq": {"alias": "boolq", "acc,none": 0.5495412844036697, "acc_stderr,none": 0.008702022442950883}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19901719901719903, "acc_stderr,none": 0.011430809442838375}, "copa": {"alias": "copa", "acc,none": 
0.75, "acc_stderr,none": 0.04351941398892446}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4355706034654451, "acc_stderr,none": 0.004948181367024938, "acc_norm,none": 0.5819557857000598, "acc_norm_stderr,none": 0.004922294797766665}, "mmlu": {"acc,none": 0.2591511180743484, "acc_stderr,none": 0.003692299952817129, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27927736450584484, "acc_stderr,none": 0.006533988951587552, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.033333333333333354}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.296969696969697, "acc_stderr,none": 0.03567969772268049}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.03132179803083292}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.29957805907172996, "acc_stderr,none": 0.029818024749753102}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.371900826446281, "acc_stderr,none": 0.04412015806624504}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.04077494709252628}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3067484662576687, "acc_stderr,none": 0.036230899157241474}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2947976878612717, "acc_stderr,none": 0.02454761779480383}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3022508038585209, "acc_stderr,none": 0.026082700695399655}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2993827160493827, "acc_stderr,none": 0.02548311560119547}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2816166883963494, "acc_stderr,none": 0.011487783272786696}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.03508771929824563}, "mmlu_other": {"acc,none": 0.24299967814612167, "acc_stderr,none": 0.007685020653974862, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22641509433962265, "acc_stderr,none": 0.02575755989310674}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.03295304696818318}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.20179372197309417, "acc_stderr,none": 0.02693611191280227}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.03916667762822585}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.26495726495726496, "acc_stderr,none": 0.028911208802749486}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2707535121328225, "acc_stderr,none": 0.01588988836256049}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.0248480182638752}, "mmlu_professional_accounting": {"alias": " - 
professional_accounting", "acc,none": 0.2695035460992908, "acc_stderr,none": 0.02646903681859063}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.18382352941176472, "acc_stderr,none": 0.023529242185193106}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.20481927710843373, "acc_stderr,none": 0.031417842916639245}, "mmlu_social_sciences": {"acc,none": 0.23756906077348067, "acc_stderr,none": 0.007676764149375163, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.20175438596491227, "acc_stderr,none": 0.03775205013583638}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25252525252525254, "acc_stderr,none": 0.030954055470365907}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22797927461139897, "acc_stderr,none": 0.03027690994517826}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2205128205128205, "acc_stderr,none": 0.02102067268082791}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.22268907563025211, "acc_stderr,none": 0.02702543349888238}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23119266055045873, "acc_stderr,none": 0.018075750241633156}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2761437908496732, "acc_stderr,none": 0.018087276935663137}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.20909090909090908, "acc_stderr,none": 0.038950910157241364}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2163265306122449, "acc_stderr,none": 0.02635891633490402}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916718}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_stem": {"acc,none": 0.2660957817951158, "acc_stderr,none": 0.00786470145985221, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.32592592592592595, "acc_stderr,none": 0.040491220417025055}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3092105263157895, "acc_stderr,none": 0.037610708698674805}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2, "acc_stderr,none": 0.0261488180184245}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", 
"acc,none": 0.296551724137931, "acc_stderr,none": 0.03806142687309993}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776564}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25806451612903225, "acc_stderr,none": 0.024892469172462836}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.031947400722655395}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02671924078371216}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2847682119205298, "acc_stderr,none": 0.03684881521389023}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.20833333333333334, "acc_stderr,none": 0.02769691071309394}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.24107142857142858, "acc_stderr,none": 0.04059867246952687}, "mmlu_pro": {"exact_match,custom-extract": 0.13472406914893617, "exact_match_stderr,custom-extract": 0.003094262631605881, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.20641562064156208, "exact_match_stderr,custom-extract": 0.015125555172262694}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09759188846641319, "exact_match_stderr,custom-extract": 0.010571710152486613}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06978798586572438, "exact_match_stderr,custom-extract": 0.007576175072607249}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.14146341463414633, "exact_match_stderr,custom-extract": 0.01723216394465976}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.17061611374407584, "exact_match_stderr,custom-extract": 0.012956092265003962}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.11661506707946337, "exact_match_stderr,custom-extract": 0.01031607874089496}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.17603911980440098, "exact_match_stderr,custom-extract": 0.013324375473773215}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.15223097112860892, "exact_match_stderr,custom-extract": 0.01842886055804924}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.12806539509536785, "exact_match_stderr,custom-extract": 0.010075381773702277}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.10066617320503331, "exact_match_stderr,custom-extract": 0.008189084640082613}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.13095238095238096, "exact_match_stderr,custom-extract": 0.011103953542030812}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.14829659318637275, "exact_match_stderr,custom-extract": 0.0159255744939775}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.12702078521939955, "exact_match_stderr,custom-extract": 0.00924276693598881}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.20426065162907267, "exact_match_stderr,custom-extract": 0.01428067096264084}, "openbookqa": 
{"alias": "openbookqa", "acc,none": 0.26, "acc_stderr,none": 0.019635965529725512, "acc_norm,none": 0.374, "acc_norm_stderr,none": 0.021660710347204484}, "piqa": {"alias": "piqa", "acc,none": 0.6849836779107725, "acc_stderr,none": 0.01083807274624065, "acc_norm,none": 0.6953210010881393, "acc_norm_stderr,none": 0.010738889044325161}, "race": {"alias": "race", "acc,none": 0.36650717703349284, "acc_stderr,none": 0.014912890943719231}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4247697031729785, "acc_stderr,none": 0.011185271257671336}, "winogrande": {"alias": "winogrande", "acc,none": 0.6195737963693765, "acc_stderr,none": 0.013644727908656831}} +{"created_at": "2025-04-28T11:04:11.467245", "global_step": 366000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3395904436860068, "acc_stderr,none": 0.013839039762820167, "acc_norm,none": 0.3916382252559727, "acc_norm_stderr,none": 0.01426412212493822}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6611952861952862, "acc_stderr,none": 0.009711980224301631, "acc_norm,none": 0.6346801346801347, "acc_norm_stderr,none": 0.009880576614806924}, "boolq": {"alias": "boolq", "acc,none": 0.7048929663608563, "acc_stderr,none": 0.00797707928526976}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19574119574119575, "acc_stderr,none": 0.011359497363584395}, "copa": {"alias": "copa", "acc,none": 0.74, "acc_stderr,none": 0.0440844002276808}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.43487353116908983, "acc_stderr,none": 0.004947272454226198, "acc_norm,none": 0.5785700059749054, "acc_norm_stderr,none": 0.004927790036726622}, "mmlu": {"acc,none": 0.2649195271328871, "acc_stderr,none": 0.0037203596269857545, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2760892667375133, "acc_stderr,none": 0.006510322242272694, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.033333333333333354}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.03546563019624336}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3284313725490196, "acc_stderr,none": 0.03296245110172229}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3037974683544304, "acc_stderr,none": 0.029936696387138605}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.04391326286724071}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3067484662576687, "acc_stderr,none": 0.036230899157241474}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.29190751445086704, "acc_stderr,none": 0.024476994076247333}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2958199356913183, "acc_stderr,none": 0.025922371788818777}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2993827160493827, "acc_stderr,none": 0.02548311560119547}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26792698826597133, "acc_stderr,none": 0.011311347690633869}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.03446296217088427}, 
"mmlu_other": {"acc,none": 0.2481493401995494, "acc_stderr,none": 0.007739860107405158, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2339622641509434, "acc_stderr,none": 0.02605529690115292}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2832369942196532, "acc_stderr,none": 0.03435568056047874}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2062780269058296, "acc_stderr,none": 0.02715715047956382}, "mmlu_management": {"alias": " - management", "acc,none": 0.22330097087378642, "acc_stderr,none": 0.04123553189891431}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2264957264957265, "acc_stderr,none": 0.02742100729539292}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2720306513409962, "acc_stderr,none": 0.015913367447500517}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2679738562091503, "acc_stderr,none": 0.025360603796242553}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2730496453900709, "acc_stderr,none": 0.026577860943307854}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20955882352941177, "acc_stderr,none": 0.02472311040767705}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.1746987951807229, "acc_stderr,none": 0.029560326211256847}, "mmlu_social_sciences": {"acc,none": 0.2560935976600585, "acc_stderr,none": 0.007875694449031584, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518754}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25757575757575757, "acc_stderr,none": 0.03115626951964684}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.25906735751295334, "acc_stderr,none": 0.03161877917935411}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2564102564102564, "acc_stderr,none": 0.02213908110397153}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24369747899159663, "acc_stderr,none": 0.02788682807838056}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23486238532110093, "acc_stderr,none": 0.018175110510343578}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2761437908496732, "acc_stderr,none": 0.018087276935663137}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.04122066502878284}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2979591836734694, "acc_stderr,none": 0.02927956741106567}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2537313432835821, "acc_stderr,none": 0.03076944496729602}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_stem": {"acc,none": 
0.27339042182048845, "acc_stderr,none": 0.007931019384985402, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.32592592592592595, "acc_stderr,none": 0.040491220417025055}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3223684210526316, "acc_stderr,none": 0.03803510248351586}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2847222222222222, "acc_stderr,none": 0.03773809990686935}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.37, "acc_stderr,none": 0.048523658709391}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.04220773659171452}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.20425531914893616, "acc_stderr,none": 0.02635515841334943}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2827586206896552, "acc_stderr,none": 0.03752833958003336}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776564}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.27419354838709675, "acc_stderr,none": 0.025378139970885196}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.30049261083743845, "acc_stderr,none": 0.03225799476233485}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2781456953642384, "acc_stderr,none": 0.03658603262763743}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.028353212866863438}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755806}, "mmlu_pro": {"exact_match,custom-extract": 0.1432845744680851, "exact_match_stderr,custom-extract": 0.0031775330528792793, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.19804741980474197, "exact_match_stderr,custom-extract": 0.014893694032916231}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11913814955640051, "exact_match_stderr,custom-extract": 0.011540276571470732}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.09717314487632508, "exact_match_stderr,custom-extract": 0.008807325782377471}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.15609756097560976, "exact_match_stderr,custom-extract": 0.01794661415125632}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.20497630331753555, "exact_match_stderr,custom-extract": 0.013903626023537346}, 
"mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.11661506707946337, "exact_match_stderr,custom-extract": 0.01031607874089497}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.19193154034229828, "exact_match_stderr,custom-extract": 0.01377800138301344}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.14173228346456693, "exact_match_stderr,custom-extract": 0.01789179783326822}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.12170753860127158, "exact_match_stderr,custom-extract": 0.009857844760249224}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.10436713545521836, "exact_match_stderr,custom-extract": 0.008321085955307983}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.15476190476190477, "exact_match_stderr,custom-extract": 0.011904761904761823}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1462925851703407, "exact_match_stderr,custom-extract": 0.01583620126390544}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.12548113933795227, "exact_match_stderr,custom-extract": 0.009194676853537979}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.20426065162907267, "exact_match_stderr,custom-extract": 0.014280670962640848}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.254, "acc_stderr,none": 0.019486596801643375, "acc_norm,none": 0.36, "acc_norm_stderr,none": 0.02148775108972052}, "piqa": {"alias": "piqa", "acc,none": 0.6860718171926007, "acc_stderr,none": 0.010827928134189646, "acc_norm,none": 0.6926006528835691, "acc_norm_stderr,none": 0.010765602506939063}, "race": {"alias": "race", "acc,none": 0.3598086124401914, "acc_stderr,none": 0.014853898144597874}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.40736949846468784, "acc_stderr,none": 0.011118216651888712}, "winogrande": {"alias": "winogrande", "acc,none": 0.5990528808208366, "acc_stderr,none": 0.013773974554948028}}