{"created_at": "2025-04-13T05:48:39.601870", "global_step": 2000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.19880546075085323, "acc_stderr,none": 0.011662850198175539, "acc_norm,none": 0.24146757679180889, "acc_norm_stderr,none": 0.01250656483973943}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.4276094276094276, "acc_stderr,none": 0.01015168339743068, "acc_norm,none": 0.4158249158249158, "acc_norm_stderr,none": 0.010113348244647869}, "boolq": {"alias": "boolq", "acc,none": 0.5672782874617737, "acc_stderr,none": 0.008665526684416253}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.1981981981981982, "acc_stderr,none": 0.011413095456219316}, "copa": {"alias": "copa", "acc,none": 0.6, "acc_stderr,none": 0.04923659639173309}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.2819159529974109, "acc_stderr,none": 0.00449013069102043, "acc_norm,none": 0.30302728540131446, "acc_norm_stderr,none": 0.004586276903267095}, "mmlu": {"acc,none": 0.23237430565446518, "acc_stderr,none": 0.003559785783568522, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24293304994686504, "acc_stderr,none": 0.00625187278509765, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04006168083848878}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.033175059300091805}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.0313217980308329}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2616033755274262, "acc_stderr,none": 0.028609516716994934}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2231404958677686, "acc_stderr,none": 0.03800754475228733}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.043300437496507416}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2514450867052023, "acc_stderr,none": 0.02335736578587403}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.18971061093247588, "acc_stderr,none": 0.022268196258783218}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.022779719088733396}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2470664928292047, "acc_stderr,none": 0.01101575225527933}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.03508771929824565}, "mmlu_other": {"acc,none": 0.24203411651110396, "acc_stderr,none": 0.007668433769463873, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891366}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.031265112061730424}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036624}, "mmlu_human_aging": {"alias": " - 
human_aging", "acc,none": 0.3183856502242152, "acc_stderr,none": 0.03126580522513713}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.040580420156460344}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.029872577708891165}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24776500638569604, "acc_stderr,none": 0.01543808308056897}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.02355083135199509}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2198581560283688, "acc_stderr,none": 0.024706141070705474}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19117647058823528, "acc_stderr,none": 0.02388688192244033}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.27710843373493976, "acc_stderr,none": 0.03484331592680588}, "mmlu_social_sciences": {"acc,none": 0.22391940201494961, "acc_stderr,none": 0.007514963662349423, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281335}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.20202020202020202, "acc_stderr,none": 0.028606204289229872}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20725388601036268, "acc_stderr,none": 0.02925282329180362}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20512820512820512, "acc_stderr,none": 0.020473233173551975}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1981651376146789, "acc_stderr,none": 0.01709057380421788}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.26717557251908397, "acc_stderr,none": 0.038808483010823944}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24673202614379086, "acc_stderr,none": 0.017440820367402507}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2636363636363636, "acc_stderr,none": 0.04220224692971987}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20408163265306123, "acc_stderr,none": 0.025801283475090506}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916714}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_stem": {"acc,none": 0.21535045987947987, "acc_stderr,none": 0.007308040683485457, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.1925925925925926, "acc_stderr,none": 0.03406542058502653}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19736842105263158, "acc_stderr,none": 0.03238981601699397}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.19, 
"acc_stderr,none": 0.03942772444036624}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.04158307533083287}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.02880998985410297}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21164021164021163, "acc_stderr,none": 0.021037331505262893}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1870967741935484, "acc_stderr,none": 0.02218571009225225}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.15763546798029557, "acc_stderr,none": 0.025639014131172404}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.024882116857655113}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.18543046357615894, "acc_stderr,none": 0.031732843842942865}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1527777777777778, "acc_stderr,none": 0.02453632602613422}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.057596409574468085, "exact_match_stderr,custom-extract": 0.0021064374340498083, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.04184100418410042, "exact_match_stderr,custom-extract": 0.007482786069672008}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.053231939163498096, "exact_match_stderr,custom-extract": 0.007997326632665017}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.01060070671378092, "exact_match_stderr,custom-extract": 0.003045242659743122}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.07560975609756097, "exact_match_stderr,custom-extract": 0.013072388347810094}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.09123222748815166, "exact_match_stderr,custom-extract": 0.009917148367281418}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.03199174406604747, "exact_match_stderr,custom-extract": 0.005656148594146956}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.09046454767726161, "exact_match_stderr,custom-extract": 0.01003547644019218}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.047244094488188976, "exact_match_stderr,custom-extract": 0.010883605491044059}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.019981834695731154, "exact_match_stderr,custom-extract": 0.004219280526552914}, "mmlu_pro_math": {"alias": " - math", 
"exact_match,custom-extract": 0.07031828275351591, "exact_match_stderr,custom-extract": 0.006958800549270504}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.08441558441558442, "exact_match_stderr,custom-extract": 0.00915081259221816}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13026052104208416, "exact_match_stderr,custom-extract": 0.015082951205521087}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07390300230946882, "exact_match_stderr,custom-extract": 0.007261426284059207}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.02756892230576441, "exact_match_stderr,custom-extract": 0.005799761690758459}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.166, "acc_stderr,none": 0.016656616876531142, "acc_norm,none": 0.264, "acc_norm_stderr,none": 0.019732885585922087}, "piqa": {"alias": "piqa", "acc,none": 0.5750816104461371, "acc_stderr,none": 0.011533547946654765, "acc_norm,none": 0.5756256800870512, "acc_norm_stderr,none": 0.01153161275887105}, "race": {"alias": "race", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.013783600792343722}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.34953940634595704, "acc_stderr,none": 0.010789652212087029}, "winogrande": {"alias": "winogrande", "acc,none": 0.4964483030781373, "acc_stderr,none": 0.014052131146915853}} {"created_at": "2025-04-13T07:33:34.666441", "global_step": 4000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2175767918088737, "acc_stderr,none": 0.012057262020972494, "acc_norm,none": 0.24744027303754265, "acc_norm_stderr,none": 0.012610352663292673}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.49242424242424243, "acc_stderr,none": 0.010258605792153321, "acc_norm,none": 0.45875420875420875, "acc_norm_stderr,none": 0.01022481573025582}, "boolq": {"alias": "boolq", "acc,none": 0.5648318042813456, "acc_stderr,none": 0.008671229580582111}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2153972153972154, "acc_stderr,none": 0.01176969068622697}, "copa": {"alias": "copa", "acc,none": 0.66, "acc_stderr,none": 0.04760952285695238}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.30561641107349136, "acc_stderr,none": 0.004597265399568744, "acc_norm,none": 0.3463453495319657, "acc_norm_stderr,none": 0.004748324319714256}, "mmlu": {"acc,none": 0.24006551773251675, "acc_stderr,none": 0.003598051790375595, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24314558979808715, "acc_stderr,none": 0.0062573069037789465, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.038095238095238126}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.033464098810559534}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591361}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.27848101265822783, "acc_stderr,none": 0.029178682304842544}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.03984979653302872}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.1901840490797546, "acc_stderr,none": 0.03083349114628124}, 
"mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2514450867052023, "acc_stderr,none": 0.023357365785874037}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19935691318327975, "acc_stderr,none": 0.022691033780549656}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24382716049382716, "acc_stderr,none": 0.023891879541959617}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2503259452411995, "acc_stderr,none": 0.011064151027165433}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23976608187134502, "acc_stderr,none": 0.03274485211946956}, "mmlu_other": {"acc,none": 0.2565175410363695, "acc_stderr,none": 0.007801561938651864, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22641509433962265, "acc_stderr,none": 0.025757559893106748}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.1791907514450867, "acc_stderr,none": 0.029242513059063273}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3721973094170404, "acc_stderr,none": 0.03244305283008731}, "mmlu_management": {"alias": " - management", "acc,none": 0.18446601941747573, "acc_stderr,none": 0.03840423627288276}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674047}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.015982814774695625}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023805186524888142}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25177304964539005, "acc_stderr,none": 0.0258921511567094}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1875, "acc_stderr,none": 0.023709788253811766}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.29518072289156627, "acc_stderr,none": 0.0355092018568963}, "mmlu_social_sciences": {"acc,none": 0.2333441663958401, "acc_stderr,none": 0.007620264116280758, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.030532892233932022}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21243523316062177, "acc_stderr,none": 0.02951928261681725}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2205128205128205, "acc_stderr,none": 0.02102067268082791}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2184873949579832, "acc_stderr,none": 0.026841514322958948}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23669724770642203, "acc_stderr,none": 0.018224078117299074}, "mmlu_human_sexuality": {"alias": " - human_sexuality", 
"acc,none": 0.2748091603053435, "acc_stderr,none": 0.03915345408847836}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2434640522875817, "acc_stderr,none": 0.01736247376214661}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3090909090909091, "acc_stderr,none": 0.044262946482000985}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.16326530612244897, "acc_stderr,none": 0.023661699177098608}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.27860696517412936, "acc_stderr,none": 0.031700561834973086}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_stem": {"acc,none": 0.22581668252457976, "acc_stderr,none": 0.00742793910561649, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.1925925925925926, "acc_stderr,none": 0.03406542058502652}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.20394736842105263, "acc_stderr,none": 0.03279000406310052}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036625}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.04280105837364395}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165065}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2936170212765957, "acc_stderr,none": 0.02977164271249123}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.1793103448275862, "acc_stderr,none": 0.03196766433373187}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24338624338624337, "acc_stderr,none": 0.022101128787415426}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.23548387096774193, "acc_stderr,none": 0.02413763242933771}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.21182266009852216, "acc_stderr,none": 0.028748983689941065}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.28888888888888886, "acc_stderr,none": 0.027634907264178544}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.17218543046357615, "acc_stderr,none": 0.030826136961962375}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.125, "acc_stderr,none": 0.022554842722407934}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.20535714285714285, "acc_stderr,none": 0.03834241021419072}, "mmlu_pro": {"exact_match,custom-extract": 0.08494015957446809, "exact_match_stderr,custom-extract": 0.0025334639259665262, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 
0.100418410041841, "exact_match_stderr,custom-extract": 0.011232345114394717}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.07984790874524715, "exact_match_stderr,custom-extract": 0.009656027037033416}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.04946996466431095, "exact_match_stderr,custom-extract": 0.006447961264949632}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.04878048780487805, "exact_match_stderr,custom-extract": 0.010651279732379514}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.08886255924170616, "exact_match_stderr,custom-extract": 0.009800259323482094}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07017543859649122, "exact_match_stderr,custom-extract": 0.008210231372398015}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12469437652811736, "exact_match_stderr,custom-extract": 0.011558254824072248}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11811023622047244, "exact_match_stderr,custom-extract": 0.016556141198042416}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08628519527702089, "exact_match_stderr,custom-extract": 0.008465977919833964}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.05847520355292376, "exact_match_stderr,custom-extract": 0.00638608754753002}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.09090909090909091, "exact_match_stderr,custom-extract": 0.009462512458958352}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1462925851703407, "exact_match_stderr,custom-extract": 0.01583620126390544}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08622016936104696, "exact_match_stderr,custom-extract": 0.0077909043682681724}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.09774436090225563, "exact_match_stderr,custom-extract": 0.010519170574646106}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.17, "acc_stderr,none": 0.01681563353139343, "acc_norm,none": 0.294, "acc_norm_stderr,none": 0.02039509548493661}, "piqa": {"alias": "piqa", "acc,none": 0.6044613710554951, "acc_stderr,none": 0.011408384494565282, "acc_norm,none": 0.6120783460282916, "acc_norm_stderr,none": 0.011368965300027381}, "race": {"alias": "race", "acc,none": 0.3004784688995215, "acc_stderr,none": 0.014189169370361522}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.36284544524053225, "acc_stderr,none": 0.010880080590947026}, "winogrande": {"alias": "winogrande", "acc,none": 0.5098658247829518, "acc_stderr,none": 0.014049749833367589}} {"created_at": "2025-04-13T09:22:33.960735", "global_step": 6000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2167235494880546, "acc_stderr,none": 0.012040156713481192, "acc_norm,none": 0.25341296928327645, "acc_norm_stderr,none": 0.012710896778378602}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.4583333333333333, "acc_stderr,none": 0.010224097209176596, "acc_norm,none": 0.4377104377104377, "acc_norm_stderr,none": 0.010179856486006902}, "boolq": {"alias": "boolq", "acc,none": 0.5437308868501529, "acc_stderr,none": 0.008711542845585772}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21375921375921375, "acc_stderr,none": 0.011737086112127208}, "copa": {"alias": "copa", "acc,none": 0.61, "acc_stderr,none": 
0.04902071300001975}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.31905994821748657, "acc_stderr,none": 0.004651597209993096, "acc_norm,none": 0.37064329814777935, "acc_norm_stderr,none": 0.00481989994534249}, "mmlu": {"acc,none": 0.24647486113089304, "acc_stderr,none": 0.0036373401684703407, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24442082890541977, "acc_stderr,none": 0.006268202825586403, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30952380952380953, "acc_stderr,none": 0.04134913018303316}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.033744026441394036}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.030778554678693264}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.22784810126582278, "acc_stderr,none": 0.02730348459906944}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.23140495867768596, "acc_stderr,none": 0.038498560987940904}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.040191074725573483}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.032262193772867744}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23410404624277456, "acc_stderr,none": 0.022797110278071138}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2558659217877095, "acc_stderr,none": 0.014593620923210761}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.21221864951768488, "acc_stderr,none": 0.023222756797435126}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.22839506172839505, "acc_stderr,none": 0.023358211840626267}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24771838331160365, "acc_stderr,none": 0.011025499291443737}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.26900584795321636, "acc_stderr,none": 0.0340105262010409}, "mmlu_other": {"acc,none": 0.24943675571290633, "acc_stderr,none": 0.007755044191539503, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2528301886792453, "acc_stderr,none": 0.02674989977124123}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2658959537572254, "acc_stderr,none": 0.033687629322594316}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.14, "acc_stderr,none": 0.03487350880197771}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2825112107623318, "acc_stderr,none": 0.03021683101150878}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.04058042015646034}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.31196581196581197, "acc_stderr,none": 0.030351527323344948}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23116219667943805, "acc_stderr,none": 0.015075523238101084}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24183006535947713, "acc_stderr,none": 0.024518195641879334}, "mmlu_professional_accounting": {"alias": " - 
professional_accounting", "acc,none": 0.24113475177304963, "acc_stderr,none": 0.025518731049537766}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.22794117647058823, "acc_stderr,none": 0.025483081468029804}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.27710843373493976, "acc_stderr,none": 0.034843315926805875}, "mmlu_social_sciences": {"acc,none": 0.24699382515437113, "acc_stderr,none": 0.007780461919541196, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.042270544512322}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.26262626262626265, "acc_stderr,none": 0.03135305009533084}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.24870466321243523, "acc_stderr,none": 0.031195840877700293}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2358974358974359, "acc_stderr,none": 0.021525965407408726}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2773109243697479, "acc_stderr,none": 0.029079374539480007}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24036697247706423, "acc_stderr,none": 0.01832060732096407}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.29770992366412213, "acc_stderr,none": 0.04010358942462203}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.016906615927288142}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2636363636363636, "acc_stderr,none": 0.04220224692971987}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22040816326530613, "acc_stderr,none": 0.0265370453121453}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.25870646766169153, "acc_stderr,none": 0.030965903123573023}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_stem": {"acc,none": 0.24611481129083412, "acc_stderr,none": 0.007672548219304254, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.22962962962962963, "acc_stderr,none": 0.036333844140734636}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.26973684210526316, "acc_stderr,none": 0.03611780560284898}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.046482319871173156}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3137254901960784, "acc_stderr,none": 0.04617034827006718}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2425531914893617, "acc_stderr,none": 0.028020226271200217}, "mmlu_electrical_engineering": {"alias": " - 
electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.022644212615525218}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.22580645161290322, "acc_stderr,none": 0.023785577884181012}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.19704433497536947, "acc_stderr,none": 0.02798672466673621}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.15, "acc_stderr,none": 0.03588702812826371}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.025348097468097845}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.25165562913907286, "acc_stderr,none": 0.035433042343899844}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.24537037037037038, "acc_stderr,none": 0.029346665094372937}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.29464285714285715, "acc_stderr,none": 0.04327040932578728}, "mmlu_pro": {"exact_match,custom-extract": 0.056432845744680854, "exact_match_stderr,custom-extract": 0.0020926241667861946, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.05439330543933055, "exact_match_stderr,custom-extract": 0.008475620127170813}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.060836501901140684, "exact_match_stderr,custom-extract": 0.008515097217967529}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.027385159010600707, "exact_match_stderr,custom-extract": 0.0048528489883542525}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.036585365853658534, "exact_match_stderr,custom-extract": 0.009283220509573744}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.07109004739336493, "exact_match_stderr,custom-extract": 0.008850695970581643}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.03199174406604747, "exact_match_stderr,custom-extract": 0.005656148594146955}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.08924205378973105, "exact_match_stderr,custom-extract": 0.009974134825348273}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.05249343832020997, "exact_match_stderr,custom-extract": 0.011440679641839065}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.027247956403269755, "exact_match_stderr,custom-extract": 0.004908758894717502}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.03478904515173945, "exact_match_stderr,custom-extract": 0.004987299981261299}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.07251082251082251, "exact_match_stderr,custom-extract": 0.008536011481644207}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12224448897795591, "exact_match_stderr,custom-extract": 0.014678671649386738}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07467282525019246, "exact_match_stderr,custom-extract": 0.0072961138747172204}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.07518796992481203, 
"exact_match_stderr,custom-extract": 0.009340535041418843}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.184, "acc_stderr,none": 0.01734617478175285, "acc_norm,none": 0.3, "acc_norm_stderr,none": 0.02051442622562805}, "piqa": {"alias": "piqa", "acc,none": 0.6186071817192601, "acc_stderr,none": 0.01133285040652868, "acc_norm,none": 0.6142546245919478, "acc_norm_stderr,none": 0.011357166777524042}, "race": {"alias": "race", "acc,none": 0.29473684210526313, "acc_stderr,none": 0.014110505176373047}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.38485158648925283, "acc_stderr,none": 0.011009953204615657}, "winogrande": {"alias": "winogrande", "acc,none": 0.5295974743488555, "acc_stderr,none": 0.014027843827840086}} {"created_at": "2025-04-13T10:02:38.751447", "global_step": 8000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.23976109215017063, "acc_stderr,none": 0.012476304127453947, "acc_norm,none": 0.2713310580204778, "acc_norm_stderr,none": 0.012993807727545789}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5164141414141414, "acc_stderr,none": 0.010254253565929305, "acc_norm,none": 0.49158249158249157, "acc_norm_stderr,none": 0.010258329515226454}, "boolq": {"alias": "boolq", "acc,none": 0.42018348623853213, "acc_stderr,none": 0.008632912118872548}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.18591318591318592, "acc_stderr,none": 0.0111380853498107}, "copa": {"alias": "copa", "acc,none": 0.59, "acc_stderr,none": 0.04943110704237101}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.32613025293766185, "acc_stderr,none": 0.004678375103797963, "acc_norm,none": 0.3828918542123083, "acc_norm_stderr,none": 0.004850988215167547}, "mmlu": {"acc,none": 0.23864121919954423, "acc_stderr,none": 0.0035938931788170735, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2463336875664187, "acc_stderr,none": 0.0062805373279316585, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23015873015873015, "acc_stderr,none": 0.03764950879790605}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.18787878787878787, "acc_stderr,none": 0.03050193405942914}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23039215686274508, "acc_stderr,none": 0.029554292605695053}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.029041333510598035}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.24793388429752067, "acc_stderr,none": 0.03941897526516302}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.1901840490797546, "acc_stderr,none": 0.03083349114628124}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.25722543352601157, "acc_stderr,none": 0.023532925431044276}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808862}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.22186495176848875, "acc_stderr,none": 0.023598858292863047}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.023468429832451152}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2542372881355932, "acc_stderr,none": 0.011121129007840671}, "mmlu_world_religions": {"alias": " - 
world_religions", "acc,none": 0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.2481493401995494, "acc_stderr,none": 0.007725828200315308, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2528301886792453, "acc_stderr,none": 0.026749899771241235}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.21965317919075145, "acc_stderr,none": 0.031568093627031744}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3094170403587444, "acc_stderr,none": 0.03102441174057221}, "mmlu_management": {"alias": " - management", "acc,none": 0.1650485436893204, "acc_stderr,none": 0.03675668832233188}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.32051282051282054, "acc_stderr,none": 0.030572811310299614}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23754789272030652, "acc_stderr,none": 0.015218733046150191}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22875816993464052, "acc_stderr,none": 0.02405102973991226}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24113475177304963, "acc_stderr,none": 0.025518731049537762}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.17647058823529413, "acc_stderr,none": 0.023157468308559352}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2469879518072289, "acc_stderr,none": 0.03357351982064536}, "mmlu_social_sciences": {"acc,none": 0.2170945726356841, "acc_stderr,none": 0.007435687247699474, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518754}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.02912652283458682}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20207253886010362, "acc_stderr,none": 0.02897908979429673}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.18717948717948718, "acc_stderr,none": 0.019776601086550036}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.22268907563025211, "acc_stderr,none": 0.027025433498882385}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.21834862385321102, "acc_stderr,none": 0.017712600528722713}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596917}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.017401816711427657}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.041220665028782834}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.17142857142857143, "acc_stderr,none": 0.02412746346265015}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22388059701492538, "acc_stderr,none": 0.02947525023601717}, "mmlu_us_foreign_policy": {"alias": " - 
us_foreign_policy", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_stem": {"acc,none": 0.23882017126546146, "acc_stderr,none": 0.0075977297849925554, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.17777777777777778, "acc_stderr,none": 0.03302789859901718}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2565789473684211, "acc_stderr,none": 0.0355418036802569}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2152777777777778, "acc_stderr,none": 0.03437079344106135}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.04220773659171452}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.20851063829787234, "acc_stderr,none": 0.02655698211783873}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.03664666337225256}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.26455026455026454, "acc_stderr,none": 0.022717467897708617}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25483870967741934, "acc_stderr,none": 0.024790118459332208}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.031447125816782426}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.025348097468097863}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.03603038545360385}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.17592592592592593, "acc_stderr,none": 0.02596742095825853}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.26785714285714285, "acc_stderr,none": 0.04203277291467764}, "mmlu_pro": {"exact_match,custom-extract": 0.0577626329787234, "exact_match_stderr,custom-extract": 0.0021172747265495637, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.03626220362622036, "exact_match_stderr,custom-extract": 0.0069863450429537574}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.05449936628643853, "exact_match_stderr,custom-extract": 0.00808655471812044}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.02561837455830389, "exact_match_stderr,custom-extract": 0.00469795687511461}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.05365853658536585, "exact_match_stderr,custom-extract": 0.011142475619285143}, "mmlu_pro_economics": {"alias": " - economics", 
"exact_match,custom-extract": 0.06516587677725119, "exact_match_stderr,custom-extract": 0.008500874651887182}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.04540763673890609, "exact_match_stderr,custom-extract": 0.006691689003512086}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.06968215158924206, "exact_match_stderr,custom-extract": 0.008907696762609756}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10761154855643044, "exact_match_stderr,custom-extract": 0.015896979452723385}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.03905540417801998, "exact_match_stderr,custom-extract": 0.005841083304141354}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.043671354552183565, "exact_match_stderr,custom-extract": 0.005562051802584955}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.08008658008658008, "exact_match_stderr,custom-extract": 0.008934134801706002}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13026052104208416, "exact_match_stderr,custom-extract": 0.015082951205521087}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.06466512702078522, "exact_match_stderr,custom-extract": 0.006826235777552132}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.06641604010025062, "exact_match_stderr,custom-extract": 0.00882031345147371}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.204, "acc_stderr,none": 0.018039369104138673, "acc_norm,none": 0.294, "acc_norm_stderr,none": 0.020395095484936614}, "piqa": {"alias": "piqa", "acc,none": 0.6180631120783461, "acc_stderr,none": 0.011335942557505226, "acc_norm,none": 0.6011969532100109, "acc_norm_stderr,none": 0.01142439054503728}, "race": {"alias": "race", "acc,none": 0.2966507177033493, "acc_stderr,none": 0.014137023394252773}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.37308085977482086, "acc_stderr,none": 0.010943497107992355}, "winogrande": {"alias": "winogrande", "acc,none": 0.5059194948697711, "acc_stderr,none": 0.01405150083848581}} {"created_at": "2025-04-13T14:06:39.864559", "global_step": 12000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2295221843003413, "acc_stderr,none": 0.012288926760890785, "acc_norm,none": 0.26706484641638223, "acc_norm_stderr,none": 0.012928933196496349}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5387205387205387, "acc_stderr,none": 0.010228972678389608, "acc_norm,none": 0.4957912457912458, "acc_norm_stderr,none": 0.010259420038764079}, "boolq": {"alias": "boolq", "acc,none": 0.5510703363914373, "acc_stderr,none": 0.008699318031464162}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20393120393120392, "acc_stderr,none": 0.011535521334313655}, "copa": {"alias": "copa", "acc,none": 0.63, "acc_stderr,none": 0.048523658709391}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3403704441346345, "acc_stderr,none": 0.004728653488866913, "acc_norm,none": 0.41455885281816374, "acc_norm_stderr,none": 0.004916388962142324}, "mmlu": {"acc,none": 0.23750178037316622, "acc_stderr,none": 0.003587636618916815, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2363443145589798, "acc_stderr,none": 0.006198317519406024, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.03893259610604675}, "mmlu_high_school_european_history": {"alias": " - 
high_school_european_history", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.03192271569548299}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23039215686274508, "acc_stderr,none": 0.029554292605695066}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25738396624472576, "acc_stderr,none": 0.028458820991460288}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2644628099173554, "acc_stderr,none": 0.04026187527591207}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094634}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615767}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.023445826276545553}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23910614525139665, "acc_stderr,none": 0.01426555419233115}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.21543408360128619, "acc_stderr,none": 0.023350225475471414}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.22530864197530864, "acc_stderr,none": 0.023246202647819757}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2288135593220339, "acc_stderr,none": 0.010728759090375493}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.27485380116959063, "acc_stderr,none": 0.03424042924691584}, "mmlu_other": {"acc,none": 0.2597360798197618, "acc_stderr,none": 0.007855906729770785, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421296}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24150943396226415, "acc_stderr,none": 0.026341480371118355}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2023121387283237, "acc_stderr,none": 0.03063114553919882}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3183856502242152, "acc_stderr,none": 0.03126580522513713}, "mmlu_management": {"alias": " - management", "acc,none": 0.27184466019417475, "acc_stderr,none": 0.044052680241409216}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.29914529914529914, "acc_stderr,none": 0.02999695185834948}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.01598281477469563}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023805186524888125}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2127659574468085, "acc_stderr,none": 0.024414612974307706}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.25, "acc_stderr,none": 0.026303648393696036}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2891566265060241, "acc_stderr,none": 0.03529486801511115}, "mmlu_social_sciences": {"acc,none": 0.2216444588885278, "acc_stderr,none": 0.007489322348253162, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748142}, "mmlu_high_school_geography": 
{"alias": " - high_school_geography", "acc,none": 0.2474747474747475, "acc_stderr,none": 0.030746300742124505}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21243523316062177, "acc_stderr,none": 0.029519282616817244}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.19487179487179487, "acc_stderr,none": 0.020083167595181393}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.226890756302521, "acc_stderr,none": 0.027205371538279476}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23669724770642203, "acc_stderr,none": 0.01822407811729907}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.21374045801526717, "acc_stderr,none": 0.035954616117746904}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.23039215686274508, "acc_stderr,none": 0.01703522925803403}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072775}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.1673469387755102, "acc_stderr,none": 0.02389714476891452}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.25870646766169153, "acc_stderr,none": 0.030965903123573023}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653697}, "mmlu_stem": {"acc,none": 0.2327941642879797, "acc_stderr,none": 0.007509256204312296, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.03547854198560826}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.29605263157894735, "acc_stderr,none": 0.037150621549989056}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2916666666666667, "acc_stderr,none": 0.03800968060554858}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.16, "acc_stderr,none": 0.03684529491774708}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036845}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816508}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.038739587141493524}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.25957446808510637, "acc_stderr,none": 0.028659179374292333}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2827586206896552, "acc_stderr,none": 0.03752833958003337}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2328042328042328, "acc_stderr,none": 0.02176596167215453}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.22903225806451613, "acc_stderr,none": 0.02390491431178265}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1921182266009852, "acc_stderr,none": 0.027719315709614778}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.21, "acc_stderr,none": 
0.040936018074033256}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.026842057873833706}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.0347918557259966}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.14351851851851852, "acc_stderr,none": 0.023910779252644378}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.044328040552915185}, "mmlu_pro": {"exact_match,custom-extract": 0.06665558510638298, "exact_match_stderr,custom-extract": 0.002261409710095081, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.03207810320781032, "exact_match_stderr,custom-extract": 0.006585185093286935}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.07224334600760456, "exact_match_stderr,custom-extract": 0.009222589030908395}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0353356890459364, "exact_match_stderr,custom-extract": 0.005489889277966926}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.03170731707317073, "exact_match_stderr,custom-extract": 0.008664059354131415}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.08412322274881516, "exact_match_stderr,custom-extract": 0.0095601055538458}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.04643962848297214, "exact_match_stderr,custom-extract": 0.0067636447175332785}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.10757946210268948, "exact_match_stderr,custom-extract": 0.01084022843641748}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.09186351706036745, "exact_match_stderr,custom-extract": 0.014816829983934023}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.043596730245231606, "exact_match_stderr,custom-extract": 0.0061567446650111}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.043671354552183565, "exact_match_stderr,custom-extract": 0.00556205180258495}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10497835497835498, "exact_match_stderr,custom-extract": 0.010089410685404702}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.10020040080160321, "exact_match_stderr,custom-extract": 0.013455286690416086}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07236335642802155, "exact_match_stderr,custom-extract": 0.007191358722321197}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.10275689223057644, "exact_match_stderr,custom-extract": 0.010755519334168297}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.194, "acc_stderr,none": 0.017701827855304615, "acc_norm,none": 0.306, "acc_norm_stderr,none": 0.020629569998345407}, "piqa": {"alias": "piqa", "acc,none": 0.6327529923830251, "acc_stderr,none": 0.011247128539690562, "acc_norm,none": 0.6229597388465724, "acc_norm_stderr,none": 0.011307569752543897}, "race": {"alias": "race", "acc,none": 0.3129186602870813, "acc_stderr,none": 0.014350583456012766}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.38485158648925283, "acc_stderr,none": 0.011009953204615655}, "winogrande": {"alias": "winogrande", "acc,none": 
0.5564325177584846, "acc_stderr,none": 0.013962694907620402}} {"created_at": "2025-04-13T15:43:38.482225", "global_step": 14000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.25, "acc_stderr,none": 0.012653835621466646, "acc_norm,none": 0.28668941979522183, "acc_norm_stderr,none": 0.013214986329274763}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5307239057239057, "acc_stderr,none": 0.010240395584815236, "acc_norm,none": 0.4983164983164983, "acc_norm_stderr,none": 0.010259725364582766}, "boolq": {"alias": "boolq", "acc,none": 0.6241590214067279, "acc_stderr,none": 0.008471147248160105}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.1981981981981982, "acc_stderr,none": 0.011413095456219316}, "copa": {"alias": "copa", "acc,none": 0.6, "acc_stderr,none": 0.04923659639173309}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.34465245966938857, "acc_stderr,none": 0.004742835309763665, "acc_norm,none": 0.4133638717386975, "acc_norm_stderr,none": 0.004914305798575695}, "mmlu": {"acc,none": 0.2570146702748896, "acc_stderr,none": 0.0036840459884698563, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24888416578108397, "acc_stderr,none": 0.006298333339723462, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.03809523809523811}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.20606060606060606, "acc_stderr,none": 0.031584153240477086}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3137254901960784, "acc_stderr,none": 0.03256685484460388}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.20253164556962025, "acc_stderr,none": 0.02616056824660146}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2809917355371901, "acc_stderr,none": 0.04103203830514512}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.0401910747255735}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2630057803468208, "acc_stderr,none": 0.023703099525258172}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2379421221864952, "acc_stderr,none": 0.024185150647818707}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3117283950617284, "acc_stderr,none": 0.02577311116963045}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2379400260756193, "acc_stderr,none": 0.010875700787694242}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.28654970760233917, "acc_stderr,none": 0.03467826685703826}, "mmlu_other": {"acc,none": 0.2919214676536852, "acc_stderr,none": 0.00815275549246651, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.27547169811320754, "acc_stderr,none": 0.027495663683724057}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.27167630057803466, "acc_stderr,none": 0.03391750322321659}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 
0.04760952285695236}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2600896860986547, "acc_stderr,none": 0.029442495585857483}, "mmlu_management": {"alias": " - management", "acc,none": 0.30097087378640774, "acc_stderr,none": 0.04541609446503949}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2863247863247863, "acc_stderr,none": 0.02961432369045665}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.42, "acc_stderr,none": 0.049604496374885836}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2681992337164751, "acc_stderr,none": 0.01584243083526942}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.31699346405228757, "acc_stderr,none": 0.026643278474508755}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.02668456434046099}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3161764705882353, "acc_stderr,none": 0.028245687391462923}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3192771084337349, "acc_stderr,none": 0.036293353299478616}, "mmlu_social_sciences": {"acc,none": 0.24764380890477738, "acc_stderr,none": 0.007784225743827877, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281336}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.03173071239071724}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20207253886010362, "acc_stderr,none": 0.02897908979429673}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2564102564102564, "acc_stderr,none": 0.022139081103971527}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24789915966386555, "acc_stderr,none": 0.028047967224176896}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26605504587155965, "acc_stderr,none": 0.018946022322225593}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.20610687022900764, "acc_stderr,none": 0.035477710041594626}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25326797385620914, "acc_stderr,none": 0.017593486895366835}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.041220665028782834}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2163265306122449, "acc_stderr,none": 0.026358916334904028}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2835820895522388, "acc_stderr,none": 0.031871875379197966}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_stem": {"acc,none": 0.24389470345702505, "acc_stderr,none": 0.007655522730449062, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.28888888888888886, "acc_stderr,none": 0.0391545063041425}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.21710526315789475, "acc_stderr,none": 0.03355045304882924}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - 
college_chemistry", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.042801058373643966}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2425531914893617, "acc_stderr,none": 0.028020226271200217}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131184}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.26455026455026454, "acc_stderr,none": 0.02271746789770862}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.23225806451612904, "acc_stderr,none": 0.02402225613030824}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.030108330718011625}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.02696242432507382}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2582781456953642, "acc_stderr,none": 0.035737053147634576}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18055555555555555, "acc_stderr,none": 0.02623287897149166}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.22321428571428573, "acc_stderr,none": 0.039523019677025116}, "mmlu_pro": {"exact_match,custom-extract": 0.06599069148936171, "exact_match_stderr,custom-extract": 0.002253410931361896, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.02789400278940028, "exact_match_stderr,custom-extract": 0.006153974892713407}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.08745247148288973, "exact_match_stderr,custom-extract": 0.010063537786722953}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.04858657243816254, "exact_match_stderr,custom-extract": 0.006393099549827181}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.036585365853658534, "exact_match_stderr,custom-extract": 0.009283220509573744}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.0924170616113744, "exact_match_stderr,custom-extract": 0.009974828838692067}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.03715170278637771, "exact_match_stderr,custom-extract": 0.006078978628329618}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1100244498777506, "exact_match_stderr,custom-extract": 0.01094769305560387}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.06824146981627296, "exact_match_stderr,custom-extract": 0.012935525502883788}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.04632152588555858, "exact_match_stderr,custom-extract": 
0.0063371804832628815}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.04441154700222058, "exact_match_stderr,custom-extract": 0.005606818698895929}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.08982683982683982, "exact_match_stderr,custom-extract": 0.009411616498072215}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.07615230460921844, "exact_match_stderr,custom-extract": 0.011885762390957635}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07852193995381063, "exact_match_stderr,custom-extract": 0.007466217955194465}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.08897243107769423, "exact_match_stderr,custom-extract": 0.010084731218347916}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.206, "acc_stderr,none": 0.018104794037333543, "acc_norm,none": 0.324, "acc_norm_stderr,none": 0.02095055731247745}, "piqa": {"alias": "piqa", "acc,none": 0.6490750816104461, "acc_stderr,none": 0.011135250564776792, "acc_norm,none": 0.6436343852013058, "acc_norm_stderr,none": 0.011174109865864727}, "race": {"alias": "race", "acc,none": 0.29952153110047847, "acc_stderr,none": 0.014176243669813222}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.3915046059365404, "acc_stderr,none": 0.011044497566951847}, "winogrande": {"alias": "winogrande", "acc,none": 0.5130228887134964, "acc_stderr,none": 0.01404771839399767}} {"created_at": "2025-04-13T17:29:06.878964", "global_step": 16000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2551194539249147, "acc_stderr,none": 0.012739038695202109, "acc_norm,none": 0.29948805460750855, "acc_norm_stderr,none": 0.013385021637313569}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5627104377104377, "acc_stderr,none": 0.010178768429321599, "acc_norm,none": 0.5374579124579124, "acc_norm_stderr,none": 0.010230952104570805}, "boolq": {"alias": "boolq", "acc,none": 0.5724770642201835, "acc_stderr,none": 0.00865269299717734}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2113022113022113, "acc_stderr,none": 0.011687655909401368}, "copa": {"alias": "copa", "acc,none": 0.65, "acc_stderr,none": 0.047937248544110196}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.34534953196574386, "acc_stderr,none": 0.004745103543901294, "acc_norm,none": 0.42620991834295957, "acc_norm_stderr,none": 0.004935143791573816}, "mmlu": {"acc,none": 0.2388548639794901, "acc_stderr,none": 0.00359314116061559, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23188097768331561, "acc_stderr,none": 0.006151660863469591, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.0361960452412425}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.19393939393939394, "acc_stderr,none": 0.0308741451365621}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.20098039215686275, "acc_stderr,none": 0.028125972265654362}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2869198312236287, "acc_stderr,none": 0.029443773022594693}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2644628099173554, "acc_stderr,none": 0.04026187527591207}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.042365112580946336}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", 
"acc,none": 0.2331288343558282, "acc_stderr,none": 0.033220157957767414}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23128491620111732, "acc_stderr,none": 0.014102223623152603}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.18006430868167203, "acc_stderr,none": 0.02182342285774494}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.22839506172839505, "acc_stderr,none": 0.023358211840626267}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23989569752281617, "acc_stderr,none": 0.010906282617981648}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2046783625730994, "acc_stderr,none": 0.03094445977853322}, "mmlu_other": {"acc,none": 0.2645638879948503, "acc_stderr,none": 0.00789396406559993, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24150943396226415, "acc_stderr,none": 0.02634148037111836}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.21965317919075145, "acc_stderr,none": 0.031568093627031744}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.32286995515695066, "acc_stderr,none": 0.03138147637575499}, "mmlu_management": {"alias": " - management", "acc,none": 0.27184466019417475, "acc_stderr,none": 0.044052680241409216}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3076923076923077, "acc_stderr,none": 0.030236389942173095}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2886334610472541, "acc_stderr,none": 0.01620379270319779}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.19934640522875818, "acc_stderr,none": 0.02287581699346407}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2695035460992908, "acc_stderr,none": 0.026469036818590634}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1875, "acc_stderr,none": 0.023709788253811766}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.29518072289156627, "acc_stderr,none": 0.0355092018568963}, "mmlu_social_sciences": {"acc,none": 0.23399415014624633, "acc_stderr,none": 0.00762742375916705, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748142}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.18686868686868688, "acc_stderr,none": 0.02777253333421899}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21243523316062177, "acc_stderr,none": 0.02951928261681725}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.21794871794871795, "acc_stderr,none": 0.020932445774463185}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.18907563025210083, "acc_stderr,none": 0.02543511943810535}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24220183486238533, "acc_stderr,none": 
0.01836817630659862}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2748091603053435, "acc_stderr,none": 0.03915345408847835}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.017555818091322277}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.32727272727272727, "acc_stderr,none": 0.04494290866252088}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20408163265306123, "acc_stderr,none": 0.025801283475090506}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2537313432835821, "acc_stderr,none": 0.030769444967296018}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_stem": {"acc,none": 0.22867110688233427, "acc_stderr,none": 0.007475704446005996, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2740740740740741, "acc_stderr,none": 0.03853254836552003}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.17105263157894737, "acc_stderr,none": 0.0306436070716771}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036845}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.19607843137254902, "acc_stderr,none": 0.03950581861179963}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165065}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28085106382978725, "acc_stderr,none": 0.029379170464124818}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131184}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21693121693121692, "acc_stderr,none": 0.02122708244944506}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.22903225806451613, "acc_stderr,none": 0.023904914311782648}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.22167487684729065, "acc_stderr,none": 0.029225575892489596}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2740740740740741, "acc_stderr,none": 0.027195934804085622}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.19205298013245034, "acc_stderr,none": 0.032162984205936156}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16203703703703703, "acc_stderr,none": 0.02513045365226846}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2767857142857143, "acc_stderr,none": 0.04246624336697624}, "mmlu_pro": {"exact_match,custom-extract": 0.0680684840425532, "exact_match_stderr,custom-extract": 0.002282400160542255, "alias": 
"mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.03347280334728033, "exact_match_stderr,custom-extract": 0.006721970022421937}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.07858048162230671, "exact_match_stderr,custom-extract": 0.00958568025240835}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.022084805653710248, "exact_match_stderr,custom-extract": 0.004369845531279022}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.041463414634146344, "exact_match_stderr,custom-extract": 0.00985769155729108}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.10426540284360189, "exact_match_stderr,custom-extract": 0.010525579113489905}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.05675954592363261, "exact_match_stderr,custom-extract": 0.007436917896532636}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.10268948655256724, "exact_match_stderr,custom-extract": 0.010619971250260027}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.08923884514435695, "exact_match_stderr,custom-extract": 0.014624715351405176}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07720254314259764, "exact_match_stderr,custom-extract": 0.008047716247195585}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.033308660251665435, "exact_match_stderr,custom-extract": 0.004883774603036287}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10173160173160173, "exact_match_stderr,custom-extract": 0.009950161991784666}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.10821643286573146, "exact_match_stderr,custom-extract": 0.013920719044718395}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.06235565819861432, "exact_match_stderr,custom-extract": 0.006711500954409667}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.08897243107769423, "exact_match_stderr,custom-extract": 0.010084731218347904}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.204, "acc_stderr,none": 0.018039369104138645, "acc_norm,none": 0.316, "acc_norm_stderr,none": 0.020812359515855857}, "piqa": {"alias": "piqa", "acc,none": 0.6349292709466812, "acc_stderr,none": 0.01123302183055483, "acc_norm,none": 0.6349292709466812, "acc_norm_stderr,none": 0.011233021830554829}, "race": {"alias": "race", "acc,none": 0.3167464114832536, "acc_stderr,none": 0.014397814139910632}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.38382804503582396, "acc_stderr,none": 0.011004446266126441}, "winogrande": {"alias": "winogrande", "acc,none": 0.5414364640883977, "acc_stderr,none": 0.014004146853791906}} {"created_at": "2025-04-13T19:13:05.721922", "global_step": 18000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.25341296928327645, "acc_stderr,none": 0.012710896778378606, "acc_norm,none": 0.28071672354948807, "acc_norm_stderr,none": 0.013131238126975586}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5345117845117845, "acc_stderr,none": 0.010235314238969393, "acc_norm,none": 0.5151515151515151, "acc_norm_stderr,none": 0.010255071794531508}, "boolq": {"alias": "boolq", "acc,none": 0.5097859327217126, "acc_stderr,none": 0.008743379884697196}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.18427518427518427, "acc_stderr,none": 
0.01110006057934762}, "copa": {"alias": "copa", "acc,none": 0.63, "acc_stderr,none": 0.048523658709391}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.35022903804023103, "acc_stderr,none": 0.004760666311146294, "acc_norm,none": 0.4314877514439355, "acc_norm_stderr,none": 0.004942716091996071}, "mmlu": {"acc,none": 0.2319470160945734, "acc_stderr,none": 0.00355475388881071, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24208289054197663, "acc_stderr,none": 0.006236529769533711, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.03932537680392871}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.22424242424242424, "acc_stderr,none": 0.03256866661681102}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.03019028245350195}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.02904133351059804}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.19834710743801653, "acc_stderr,none": 0.03640118271990946}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2331288343558282, "acc_stderr,none": 0.03322015795776741}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2514450867052023, "acc_stderr,none": 0.02335736578587403}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23910614525139665, "acc_stderr,none": 0.01426555419233115}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480764}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.02240967454730419}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24511082138200782, "acc_stderr,none": 0.010986307870045514}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.36257309941520466, "acc_stderr,none": 0.0368713061556206}, "mmlu_other": {"acc,none": 0.2507241712262633, "acc_stderr,none": 0.007766426502840918, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2188679245283019, "acc_stderr,none": 0.02544786382510861}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.21965317919075145, "acc_stderr,none": 0.031568093627031744}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3004484304932735, "acc_stderr,none": 0.030769352008229132}, "mmlu_management": {"alias": " - management", "acc,none": 0.20388349514563106, "acc_stderr,none": 0.039891398595317706}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.25287356321839083, "acc_stderr,none": 0.015543377313719681}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2679738562091503, "acc_stderr,none": 
0.025360603796242553}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.22695035460992907, "acc_stderr,none": 0.024987106365642973}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1801470588235294, "acc_stderr,none": 0.023345163616544838}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.21904452388690282, "acc_stderr,none": 0.007455785980475217, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.19696969696969696, "acc_stderr,none": 0.02833560973246335}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19170984455958548, "acc_stderr,none": 0.028408953626245282}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20512820512820512, "acc_stderr,none": 0.020473233173551972}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.026265024608275886}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1963302752293578, "acc_stderr,none": 0.01703071933915436}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2565359477124183, "acc_stderr,none": 0.017667841612379}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.04172343038705383}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19183673469387755, "acc_stderr,none": 0.025206963154225423}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22885572139303484, "acc_stderr,none": 0.029705284056772432}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_stem": {"acc,none": 0.21091024421186172, "acc_stderr,none": 0.007245564280545126, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.037125378336148665}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.17105263157894737, "acc_stderr,none": 0.030643607071677098}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2152777777777778, "acc_stderr,none": 0.034370793441061344}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816508}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617747}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2553191489361702, "acc_stderr,none": 
0.028504856470514192}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.20105820105820105, "acc_stderr,none": 0.02064181078237015}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.17096774193548386, "acc_stderr,none": 0.02141724293632156}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1477832512315271, "acc_stderr,none": 0.024969621333521284}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.025348097468097873}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.19205298013245034, "acc_stderr,none": 0.032162984205936156}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1388888888888889, "acc_stderr,none": 0.023585447368900125}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.04432804055291518}, "mmlu_pro": {"exact_match,custom-extract": 0.06831781914893617, "exact_match_stderr,custom-extract": 0.00228479393331031, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.0502092050209205, "exact_match_stderr,custom-extract": 0.008161108270402942}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.07604562737642585, "exact_match_stderr,custom-extract": 0.009442767082228949}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.019434628975265017, "exact_match_stderr,custom-extract": 0.004104829289279652}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.03414634146341464, "exact_match_stderr,custom-extract": 0.008979788338355143}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.08767772511848342, "exact_match_stderr,custom-extract": 0.009741032414423511}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.04024767801857585, "exact_match_stderr,custom-extract": 0.006317021020957948}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.10513447432762836, "exact_match_stderr,custom-extract": 0.010731005913982409}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.05511811023622047, "exact_match_stderr,custom-extract": 0.011706959711417738}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.047229791099000905, "exact_match_stderr,custom-extract": 0.006395960225851721}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0540340488527017, "exact_match_stderr,custom-extract": 0.00615325089732228}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.09956709956709957, "exact_match_stderr,custom-extract": 0.00985559287905707}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1402805611222445, "exact_match_stderr,custom-extract": 0.015561893867712515}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07852193995381063, "exact_match_stderr,custom-extract": 0.007466217955194464}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 
0.10150375939849623, "exact_match_stderr,custom-extract": 0.010697198018805528}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.206, "acc_stderr,none": 0.018104794037333526, "acc_norm,none": 0.3, "acc_norm_stderr,none": 0.02051442622562805}, "piqa": {"alias": "piqa", "acc,none": 0.6349292709466812, "acc_stderr,none": 0.011233021830554829, "acc_norm,none": 0.6305767138193689, "acc_norm_stderr,none": 0.011260988628572336}, "race": {"alias": "race", "acc,none": 0.3090909090909091, "acc_stderr,none": 0.014302215587018904}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.39662231320368474, "acc_stderr,none": 0.01106960369186044}, "winogrande": {"alias": "winogrande", "acc,none": 0.5603788476716653, "acc_stderr,none": 0.013949649776015694}} {"created_at": "2025-04-13T20:55:44.240699", "global_step": 20000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.26109215017064846, "acc_stderr,none": 0.012835523909473848, "acc_norm,none": 0.29180887372013653, "acc_norm_stderr,none": 0.013284525292403508}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5631313131313131, "acc_stderr,none": 0.010177672928157688, "acc_norm,none": 0.5328282828282829, "acc_norm_stderr,none": 0.010237645778853856}, "boolq": {"alias": "boolq", "acc,none": 0.6113149847094801, "acc_stderr,none": 0.008525580498982973}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19901719901719903, "acc_stderr,none": 0.011430809442838401}, "copa": {"alias": "copa", "acc,none": 0.65, "acc_stderr,none": 0.047937248544110196}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3505277833100976, "acc_stderr,none": 0.004761601303258893, "acc_norm,none": 0.4298944433379805, "acc_norm_stderr,none": 0.004940490508240643}, "mmlu": {"acc,none": 0.2629255091867255, "acc_stderr,none": 0.0037124862877943384, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2454835281615303, "acc_stderr,none": 0.006272695253506488, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.038932596106046755}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.30303030303030304, "acc_stderr,none": 0.03588624800091707}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.22058823529411764, "acc_stderr,none": 0.02910225438967409}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.23628691983122363, "acc_stderr,none": 0.027652153144159263}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.19008264462809918, "acc_stderr,none": 0.035817969517092825}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094633}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.1901840490797546, "acc_stderr,none": 0.030833491146281252}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2774566473988439, "acc_stderr,none": 0.024105712607754307}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.27150837988826815, "acc_stderr,none": 0.014874252168095271}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.22186495176848875, "acc_stderr,none": 0.023598858292863047}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.23765432098765432, "acc_stderr,none": 0.023683591837008553}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23728813559322035, "acc_stderr,none": 0.01086543669078028}, 
"mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23976608187134502, "acc_stderr,none": 0.032744852119469564}, "mmlu_other": {"acc,none": 0.25587383327969104, "acc_stderr,none": 0.007833777572654707, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.25660377358490566, "acc_stderr,none": 0.026880647889051982}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2023121387283237, "acc_stderr,none": 0.030631145539198826}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2825112107623318, "acc_stderr,none": 0.030216831011508773}, "mmlu_management": {"alias": " - management", "acc,none": 0.2912621359223301, "acc_stderr,none": 0.04498676320572922}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674043}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2503192848020434, "acc_stderr,none": 0.01549108895149458}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.02495418432487991}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25177304964539005, "acc_stderr,none": 0.0258921511567094}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.02576725201085597}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2289156626506024, "acc_stderr,none": 0.03270745277352477}, "mmlu_social_sciences": {"acc,none": 0.2892427689307767, "acc_stderr,none": 0.008170776789547285, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748141}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3383838383838384, "acc_stderr,none": 0.033711241426263014}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.3316062176165803, "acc_stderr,none": 0.03397636541089116}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.28974358974358977, "acc_stderr,none": 0.023000628243687964}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2773109243697479, "acc_stderr,none": 0.02907937453948001}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.30091743119266057, "acc_stderr,none": 0.019664751366802114}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.26717557251908397, "acc_stderr,none": 0.038808483010823965}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.01784808957491323}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04265792110940589}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.34285714285714286, "acc_stderr,none": 0.030387262919547728}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23383084577114427, "acc_stderr,none": 0.029929415408348398}, "mmlu_us_foreign_policy": 
{"alias": " - us_foreign_policy", "acc,none": 0.3, "acc_stderr,none": 0.04605661864718381}, "mmlu_stem": {"acc,none": 0.2702188392007612, "acc_stderr,none": 0.007910888305181104, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165044}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2814814814814815, "acc_stderr,none": 0.03885004245800255}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3223684210526316, "acc_stderr,none": 0.03803510248351586}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566017}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165044}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617748}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621503}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.25957446808510637, "acc_stderr,none": 0.028659179374292326}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24867724867724866, "acc_stderr,none": 0.022261817692400175}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.26129032258064516, "acc_stderr,none": 0.024993053397764815}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3251231527093596, "acc_stderr,none": 0.032957975663112704}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.02646611753895991}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2913907284768212, "acc_stderr,none": 0.037101857261199946}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2916666666666667, "acc_stderr,none": 0.03099866630456053}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03894641120044792}, "mmlu_pro": {"exact_match,custom-extract": 0.07106050531914894, "exact_match_stderr,custom-extract": 0.002333614136715188, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.0599721059972106, "exact_match_stderr,custom-extract": 0.008873368328997315}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.07224334600760456, "exact_match_stderr,custom-extract": 0.009222589030908397}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.03180212014134275, "exact_match_stderr,custom-extract": 0.0052176963130372124}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.06341463414634146, "exact_match_stderr,custom-extract": 0.012050547403328619}, "mmlu_pro_economics": {"alias": " - economics", 
"exact_match,custom-extract": 0.10781990521327015, "exact_match_stderr,custom-extract": 0.010682230633557247}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.04953560371517028, "exact_match_stderr,custom-extract": 0.006974112971215206}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1100244498777506, "exact_match_stderr,custom-extract": 0.010947693055603894}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.09186351706036745, "exact_match_stderr,custom-extract": 0.014816829983934037}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.049954586739327886, "exact_match_stderr,custom-extract": 0.0065684594847464925}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.05921539600296077, "exact_match_stderr,custom-extract": 0.006423852131454602}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.08982683982683982, "exact_match_stderr,custom-extract": 0.009411616498072215}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.09619238476953908, "exact_match_stderr,custom-extract": 0.013212763839442681}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.06543494996150885, "exact_match_stderr,custom-extract": 0.006863921514375268}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.09774436090225563, "exact_match_stderr,custom-extract": 0.010519170574646101}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.196, "acc_stderr,none": 0.017770751227744866, "acc_norm,none": 0.312, "acc_norm_stderr,none": 0.02074059653648809}, "piqa": {"alias": "piqa", "acc,none": 0.6371055495103374, "acc_stderr,none": 0.01121866757084088, "acc_norm,none": 0.6327529923830251, "acc_norm_stderr,none": 0.011247128539690556}, "race": {"alias": "race", "acc,none": 0.30526315789473685, "acc_stderr,none": 0.014252698955501592}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.38843398157625386, "acc_stderr,none": 0.011028822814998104}, "winogrande": {"alias": "winogrande", "acc,none": 0.5430149960536701, "acc_stderr,none": 0.01400038676159829}} {"created_at": "2025-04-13T22:43:56.053335", "global_step": 22000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.26791808873720135, "acc_stderr,none": 0.012942030195136432, "acc_norm,none": 0.29692832764505117, "acc_norm_stderr,none": 0.013352025976725223}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5547138047138047, "acc_stderr,none": 0.01019817113787387, "acc_norm,none": 0.5210437710437711, "acc_norm_stderr,none": 0.010250692602022571}, "boolq": {"alias": "boolq", "acc,none": 0.5259938837920489, "acc_stderr,none": 0.008733229228168141}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.1891891891891892, "acc_stderr,none": 0.011213159711868613}, "copa": {"alias": "copa", "acc,none": 0.62, "acc_stderr,none": 0.04878317312145633}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.35530770762796254, "acc_stderr,none": 0.004776283203468098, "acc_norm,none": 0.4342760406293567, "acc_norm_stderr,none": 0.004946485466544623}, "mmlu": {"acc,none": 0.24718701039737928, "acc_stderr,none": 0.0036378127366927093, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2563230605738576, "acc_stderr,none": 0.006365374138257683, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23015873015873015, "acc_stderr,none": 0.03764950879790606}, "mmlu_high_school_european_history": {"alias": " - 
high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.034277431758165236}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.029331162294251728}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.28270042194092826, "acc_stderr,none": 0.029312814153955927}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.3305785123966942, "acc_stderr,none": 0.04294340845212094}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25153374233128833, "acc_stderr,none": 0.034089978868575295}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2514450867052023, "acc_stderr,none": 0.02335736578587403}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2057877813504823, "acc_stderr,none": 0.022961339906764248}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.024659685185967273}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26140808344198174, "acc_stderr,none": 0.01122252816977131}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.03508771929824563}, "mmlu_other": {"acc,none": 0.2507241712262633, "acc_stderr,none": 0.00774150137751743, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.20754716981132076, "acc_stderr,none": 0.02495991802891127}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23121387283236994, "acc_stderr,none": 0.03214737302029468}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.336322869955157, "acc_stderr,none": 0.031708824268455005}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.04185832598928315}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.31196581196581197, "acc_stderr,none": 0.030351527323344948}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2681992337164751, "acc_stderr,none": 0.01584243083526944}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.21895424836601307, "acc_stderr,none": 0.02367908986180772}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.26595744680851063, "acc_stderr,none": 0.026358065698880596}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.14338235294117646, "acc_stderr,none": 0.021289071205445133}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21686746987951808, "acc_stderr,none": 0.03208284450356365}, "mmlu_social_sciences": {"acc,none": 0.23204419889502761, "acc_stderr,none": 0.00761814085194262, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, 
"mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25757575757575757, "acc_stderr,none": 0.031156269519646836}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.23834196891191708, "acc_stderr,none": 0.030748905363909902}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.21794871794871795, "acc_stderr,none": 0.02093244577446319}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.026653531596715484}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22385321100917432, "acc_stderr,none": 0.01787121776779022}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2824427480916031, "acc_stderr,none": 0.03948406125768361}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.01740181671142765}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.04069306319721376}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20408163265306123, "acc_stderr,none": 0.025801283475090506}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.030147775935409217}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_stem": {"acc,none": 0.24484617824294322, "acc_stderr,none": 0.007667038038724333, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.24342105263157895, "acc_stderr,none": 0.034923496688842384}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.03745554791462457}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653696}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.03708284662416542}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.028504856470514192}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.03600105692727772}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.022418042891113942}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25483870967741934, "acc_stderr,none": 0.024790118459332204}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2315270935960591, "acc_stderr,none": 0.029678333141444455}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 
0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.026962424325073828}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2582781456953642, "acc_stderr,none": 0.03573705314763457}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.02835321286686343}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.20535714285714285, "acc_stderr,none": 0.038342410214190735}, "mmlu_pro": {"exact_match,custom-extract": 0.05751329787234043, "exact_match_stderr,custom-extract": 0.0021121138773426604, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.029288702928870293, "exact_match_stderr,custom-extract": 0.006301422514936768}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.0697084917617237, "exact_match_stderr,custom-extract": 0.009071712178541184}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.022968197879858657, "exact_match_stderr,custom-extract": 0.004454372251378946}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.05609756097560976, "exact_match_stderr,custom-extract": 0.011378208553976957}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.08412322274881516, "exact_match_stderr,custom-extract": 0.009560105553845805}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.0392156862745098, "exact_match_stderr,custom-extract": 0.006238859180035595}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.07946210268948656, "exact_match_stderr,custom-extract": 0.009462150130848647}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.08136482939632546, "exact_match_stderr,custom-extract": 0.014024845803977637}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.029064486830154404, "exact_match_stderr,custom-extract": 0.005065008519732089}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.04293116210214656, "exact_match_stderr,custom-extract": 0.005516848095217846}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.09307359307359307, "exact_match_stderr,custom-extract": 0.00956309374740362}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.0841683366733467, "exact_match_stderr,custom-extract": 0.012441350584540501}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.05542725173210162, "exact_match_stderr,custom-extract": 0.006350998823300624}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.09022556390977443, "exact_match_stderr,custom-extract": 0.010148515199847084}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.2, "acc_stderr,none": 0.017906459241433848, "acc_norm,none": 0.308, "acc_norm_stderr,none": 0.020667032987466104}, "piqa": {"alias": "piqa", "acc,none": 0.6354733405875952, "acc_stderr,none": 0.011229456510295964, "acc_norm,none": 0.6376496191512514, "acc_norm_stderr,none": 0.01121504021510457}, "race": {"alias": "race", "acc,none": 0.33779904306220093, "acc_stderr,none": 0.014637734314782855}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.3930399181166837, "acc_stderr,none": 0.011052162799729233}, "winogrande": {"alias": "winogrande", 
"acc,none": 0.5548539857932123, "acc_stderr,none": 0.013967662954355487}} {"created_at": "2025-04-14T00:42:25.823697", "global_step": 24000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2636518771331058, "acc_stderr,none": 0.012875929151297061, "acc_norm,none": 0.2960750853242321, "acc_norm_stderr,none": 0.013340916085246263}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5656565656565656, "acc_stderr,none": 0.010170943451269425, "acc_norm,none": 0.5425084175084175, "acc_norm_stderr,none": 0.010222638127749492}, "boolq": {"alias": "boolq", "acc,none": 0.5840978593272171, "acc_stderr,none": 0.008620469604001021}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.18345618345618345, "acc_stderr,none": 0.01108092483741121}, "copa": {"alias": "copa", "acc,none": 0.66, "acc_stderr,none": 0.04760952285695237}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3579964150567616, "acc_stderr,none": 0.004784312972495394, "acc_norm,none": 0.43995220075682134, "acc_norm_stderr,none": 0.004953667028654386}, "mmlu": {"acc,none": 0.23757299529981485, "acc_stderr,none": 0.003589918777320869, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24378320935175346, "acc_stderr,none": 0.006256236094621989, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.03932537680392871}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.031922715695483}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.030778554678693268}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2644628099173554, "acc_stderr,none": 0.04026187527591204}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.032910995786157686}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0230836585869842}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2335195530726257, "acc_stderr,none": 0.014149575348976267}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1832797427652733, "acc_stderr,none": 0.02197419884826582}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.22839506172839505, "acc_stderr,none": 0.023358211840626267}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24967405475880053, "acc_stderr,none": 0.01105453837783232}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.03615507630310935}, "mmlu_other": {"acc,none": 0.24782748632121018, "acc_stderr,none": 0.007737754890455017, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22264150943396227, "acc_stderr,none": 0.025604233470899098}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2254335260115607, "acc_stderr,none": 0.03186209851641145}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.16, 
"acc_stderr,none": 0.0368452949177471}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3004484304932735, "acc_stderr,none": 0.030769352008229132}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24393358876117496, "acc_stderr,none": 0.015357212665829472}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22875816993464052, "acc_stderr,none": 0.024051029739912258}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.23404255319148937, "acc_stderr,none": 0.025257861359432414}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.25735294117647056, "acc_stderr,none": 0.026556519470041513}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2710843373493976, "acc_stderr,none": 0.03460579907553027}, "mmlu_social_sciences": {"acc,none": 0.22261943451413715, "acc_stderr,none": 0.007503704750842644, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.041424397194893624}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.20202020202020202, "acc_stderr,none": 0.028606204289229872}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20207253886010362, "acc_stderr,none": 0.02897908979429673}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20512820512820512, "acc_stderr,none": 0.020473233173551975}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.22268907563025211, "acc_stderr,none": 0.027025433498882374}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22018348623853212, "acc_stderr,none": 0.01776597865232756}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.037276735755969174}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.01728276069516741}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.20909090909090908, "acc_stderr,none": 0.03895091015724137}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.18775510204081633, "acc_stderr,none": 0.025000256039546212}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23383084577114427, "acc_stderr,none": 0.029929415408348384}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_stem": {"acc,none": 0.2327941642879797, "acc_stderr,none": 0.007529241869554225, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.1925925925925926, "acc_stderr,none": 0.03406542058502653}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.21710526315789475, "acc_stderr,none": 0.033550453048829226}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, 
"mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165085}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.17647058823529413, "acc_stderr,none": 0.03793281185307809}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.029241883869628827}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.036951833116502325}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2328042328042328, "acc_stderr,none": 0.021765961672154523}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2, "acc_stderr,none": 0.02275520495954294}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2019704433497537, "acc_stderr,none": 0.028247350122180284}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.22962962962962963, "acc_stderr,none": 0.025644108639267603}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2185430463576159, "acc_stderr,none": 0.033742355504256936}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.02915752218460559}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.29464285714285715, "acc_stderr,none": 0.0432704093257873}, "mmlu_pro": {"exact_match,custom-extract": 0.08868018617021277, "exact_match_stderr,custom-extract": 0.0025839522776810266, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.0794979079497908, "exact_match_stderr,custom-extract": 0.010109594327175198}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.08998732572877059, "exact_match_stderr,custom-extract": 0.010194156217460268}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.053003533568904596, "exact_match_stderr,custom-extract": 0.006661856730672935}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.06829268292682927, "exact_match_stderr,custom-extract": 0.012472835264061375}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.10308056872037914, "exact_match_stderr,custom-extract": 0.010472523223760194}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.06191950464396285, "exact_match_stderr,custom-extract": 0.0077463320824784476}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12102689486552567, "exact_match_stderr,custom-extract": 0.011410842488489007}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12073490813648294, "exact_match_stderr,custom-extract": 0.016714159620683285}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08356039963669391, 
"exact_match_stderr,custom-extract": 0.008343645336381308}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06661732050333087, "exact_match_stderr,custom-extract": 0.006786667382246557}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11255411255411256, "exact_match_stderr,custom-extract": 0.010402812578120457}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.14228456913827656, "exact_match_stderr,custom-extract": 0.0156543789197872}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08622016936104696, "exact_match_stderr,custom-extract": 0.0077909043682681586}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.11278195488721804, "exact_match_stderr,custom-extract": 0.011204844440903063}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.204, "acc_stderr,none": 0.018039369104138652, "acc_norm,none": 0.312, "acc_norm_stderr,none": 0.02074059653648808}, "piqa": {"alias": "piqa", "acc,none": 0.6534276387377584, "acc_stderr,none": 0.011103020320872181, "acc_norm,none": 0.6496191512513602, "acc_norm_stderr,none": 0.011131277554681728}, "race": {"alias": "race", "acc,none": 0.32344497607655504, "acc_stderr,none": 0.01447776480941772}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4048106448311157, "acc_stderr,none": 0.011107144401926746}, "winogrande": {"alias": "winogrande", "acc,none": 0.5359116022099447, "acc_stderr,none": 0.01401619343395831}} {"created_at": "2025-04-14T02:21:25.018345", "global_step": 26000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.26706484641638223, "acc_stderr,none": 0.01292893319649635, "acc_norm,none": 0.3199658703071672, "acc_norm_stderr,none": 0.013631345807016196}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5732323232323232, "acc_stderr,none": 0.010149141043955638, "acc_norm,none": 0.5526094276094277, "acc_norm_stderr,none": 0.010202832385415646}, "boolq": {"alias": "boolq", "acc,none": 0.6415902140672783, "acc_stderr,none": 0.008387090607540327}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21457821457821458, "acc_stderr,none": 0.011753423094216855}, "copa": {"alias": "copa", "acc,none": 0.63, "acc_stderr,none": 0.048523658709391}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3527185819557857, "acc_stderr,none": 0.004768395354146805, "acc_norm,none": 0.4382593108942442, "acc_norm_stderr,none": 0.0049515940632720535}, "mmlu": {"acc,none": 0.23123486682808717, "acc_stderr,none": 0.003552203369794013, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24165781083953242, "acc_stderr,none": 0.0062401116384119696, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.0393253768039287}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.03346409881055953}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591361}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.029041333510598025}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.04414343666854933}, "mmlu_logical_fallacies": 
{"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615767}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24566473988439305, "acc_stderr,none": 0.023176298203992012}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.18971061093247588, "acc_stderr,none": 0.022268196258783218}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2191358024691358, "acc_stderr,none": 0.023016705640262196}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24119947848761408, "acc_stderr,none": 0.010926496102034956}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.03565079670708312}, "mmlu_other": {"acc,none": 0.2452526552944963, "acc_stderr,none": 0.007700965994000229, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252606}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891366}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.19653179190751446, "acc_stderr,none": 0.030299574664788147}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.1650485436893204, "acc_stderr,none": 0.03675668832233188}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2863247863247863, "acc_stderr,none": 0.029614323690456655}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.22988505747126436, "acc_stderr,none": 0.015046301846691812}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023805186524888135}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24113475177304963, "acc_stderr,none": 0.025518731049537766}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.026799562024887674}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.2183945401364966, "acc_stderr,none": 0.007444073460885268, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281335}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.17676767676767677, "acc_stderr,none": 0.027178752639044915}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20256410256410257, "acc_stderr,none": 0.020377660970371386}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", 
"acc,none": 0.1944954128440367, "acc_stderr,none": 0.01697028909045805}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.017630827375148383}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.040693063197213754}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.18775510204081633, "acc_stderr,none": 0.02500025603954621}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23383084577114427, "acc_stderr,none": 0.029929415408348384}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.2143989850935617, "acc_stderr,none": 0.0072931260632868086, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.0335567721631314}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19078947368421054, "acc_stderr,none": 0.031975658210325}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.0377525168068637}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.02880998985410297}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.20899470899470898, "acc_stderr,none": 0.02094048156533485}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.18387096774193548, "acc_stderr,none": 0.022037217340267846}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1625615763546798, "acc_stderr,none": 0.02596030006460558}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.024882116857655106}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1574074074074074, "acc_stderr,none": 0.02483717351824239}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3482142857142857, "acc_stderr,none": 0.04521829902833585}, "mmlu_pro": {"exact_match,custom-extract": 0.08527260638297872, 
"exact_match_stderr,custom-extract": 0.0025389850415595554, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.07252440725244072, "exact_match_stderr,custom-extract": 0.00969252271883848}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.07351077313054499, "exact_match_stderr,custom-extract": 0.009296780415650432}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.046819787985865724, "exact_match_stderr,custom-extract": 0.006281609400208295}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.07317073170731707, "exact_match_stderr,custom-extract": 0.012876769299821081}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.11018957345971564, "exact_match_stderr,custom-extract": 0.010784628980842382}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.06914344685242518, "exact_match_stderr,custom-extract": 0.00815415972849956}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12469437652811736, "exact_match_stderr,custom-extract": 0.011558254824072241}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.09973753280839895, "exact_match_stderr,custom-extract": 0.015371706524248126}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08719346049046321, "exact_match_stderr,custom-extract": 0.00850618817194349}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06587712805329386, "exact_match_stderr,custom-extract": 0.00675153382039657}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10064935064935066, "exact_match_stderr,custom-extract": 0.00990305439291375}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1402805611222445, "exact_match_stderr,custom-extract": 0.015561893867712498}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.0869899923017706, "exact_match_stderr,custom-extract": 0.007822310824931377}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.09022556390977443, "exact_match_stderr,custom-extract": 0.010148515199847065}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.22, "acc_stderr,none": 0.01854421137582033, "acc_norm,none": 0.322, "acc_norm_stderr,none": 0.020916668330019882}, "piqa": {"alias": "piqa", "acc,none": 0.6463547334058759, "acc_stderr,none": 0.011154877708188685, "acc_norm,none": 0.6458106637649619, "acc_norm_stderr,none": 0.011158755672626114}, "race": {"alias": "race", "acc,none": 0.31100478468899523, "acc_stderr,none": 0.014326542383166054}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.38536335721596726, "acc_stderr,none": 0.011012687375082745}, "winogrande": {"alias": "winogrande", "acc,none": 0.5501183898973955, "acc_stderr,none": 0.013981711904049733}} {"created_at": "2025-04-14T07:37:55.530424", "global_step": 28000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.27303754266211605, "acc_stderr,none": 0.013019332762635746, "acc_norm,none": 0.3037542662116041, "acc_norm_stderr,none": 0.013438909184778764}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5542929292929293, "acc_stderr,none": 0.010199118183322992, "acc_norm,none": 0.523989898989899, "acc_norm_stderr,none": 0.01024796739274269}, "boolq": {"alias": "boolq", "acc,none": 0.43058103975535167, "acc_stderr,none": 0.00866036014598874}, "commonsense_qa": {"alias": 
"commonsense_qa", "acc,none": 0.1981981981981982, "acc_stderr,none": 0.011413095456219316}, "copa": {"alias": "copa", "acc,none": 0.67, "acc_stderr,none": 0.047258156262526066}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3565026887074288, "acc_stderr,none": 0.0047798722506337135, "acc_norm,none": 0.4427404899422426, "acc_norm_stderr,none": 0.004956953917781309}, "mmlu": {"acc,none": 0.2318758011679248, "acc_stderr,none": 0.003555491566162252, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24335812964930925, "acc_stderr,none": 0.006253203492425039, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.0404061017820884}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03225078108306289}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.28270042194092826, "acc_stderr,none": 0.029312814153955924}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04330043749650742}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1832797427652733, "acc_stderr,none": 0.021974198848265823}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023132376234543332}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24641460234680573, "acc_stderr,none": 0.011005971399927237}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.03565079670708312}, "mmlu_other": {"acc,none": 0.2452526552944963, "acc_stderr,none": 0.007697373187567178, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21132075471698114, "acc_stderr,none": 0.025125766484827845}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2254335260115607, "acc_stderr,none": 0.03186209851641144}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3273542600896861, "acc_stderr,none": 0.03149384670994131}, "mmlu_management": {"alias": " - management", "acc,none": 0.1650485436893204, "acc_stderr,none": 0.03675668832233188}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.02987257770889117}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24521072796934865, "acc_stderr,none": 0.01538435228454395}, "mmlu_nutrition": {"alias": " - nutrition", 
"acc,none": 0.2222222222222222, "acc_stderr,none": 0.023805186524888135}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2375886524822695, "acc_stderr,none": 0.025389512552729903}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1948529411764706, "acc_stderr,none": 0.024060599423487424}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3072289156626506, "acc_stderr,none": 0.03591566797824663}, "mmlu_social_sciences": {"acc,none": 0.21806954826129346, "acc_stderr,none": 0.0074436180359848106, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281335}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.17676767676767677, "acc_stderr,none": 0.027178752639044915}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20207253886010362, "acc_stderr,none": 0.02897908979429673}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20512820512820512, "acc_stderr,none": 0.020473233173551975}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1981651376146789, "acc_stderr,none": 0.017090573804217885}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306086}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24673202614379086, "acc_stderr,none": 0.017440820367402507}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072775}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.18775510204081633, "acc_stderr,none": 0.025000256039546212}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.21503330161750714, "acc_stderr,none": 0.007304042865185667, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.0335567721631314}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.17763157894736842, "acc_stderr,none": 0.031103182383123398}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", 
"acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342347}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.03600105692727771}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.20899470899470898, "acc_stderr,none": 0.02094048156533485}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.18387096774193548, "acc_stderr,none": 0.022037217340267846}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.15270935960591134, "acc_stderr,none": 0.025308904539380627}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.024882116857655113}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.025416428388767478}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.09275265957446809, "exact_match_stderr,custom-extract": 0.002641146171863864, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.08786610878661087, "exact_match_stderr,custom-extract": 0.01057994675518786}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09252217997465145, "exact_match_stderr,custom-extract": 0.010322332141863086}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07950530035335689, "exact_match_stderr,custom-extract": 0.008044098592471979}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.07560975609756097, "exact_match_stderr,custom-extract": 0.013072388347810092}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.11018957345971564, "exact_match_stderr,custom-extract": 0.01078462898084238}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.058823529411764705, "exact_match_stderr,custom-extract": 0.007562639370979067}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1198044009779951, "exact_match_stderr,custom-extract": 0.01136095799507454}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10761154855643044, "exact_match_stderr,custom-extract": 0.015896979452723375}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09355131698455948, "exact_match_stderr,custom-extract": 0.008780115347917658}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07179866765358993, "exact_match_stderr,custom-extract": 0.0070260688643428515}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11688311688311688, "exact_match_stderr,custom-extract": 0.010575091539720229}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.08817635270541083, "exact_match_stderr,custom-extract": 0.012706233135747366}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10546574287913779, "exact_match_stderr,custom-extract": 0.008525440942560907}, "mmlu_pro_psychology": {"alias": " - 
psychology", "exact_match,custom-extract": 0.10150375939849623, "exact_match_stderr,custom-extract": 0.010697198018805535}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.206, "acc_stderr,none": 0.01810479403733355, "acc_norm,none": 0.332, "acc_norm_stderr,none": 0.021081766571222856}, "piqa": {"alias": "piqa", "acc,none": 0.6365614798694232, "acc_stderr,none": 0.01122227939530451, "acc_norm,none": 0.6371055495103374, "acc_norm_stderr,none": 0.011218667570840881}, "race": {"alias": "race", "acc,none": 0.3282296650717703, "acc_stderr,none": 0.014532792620129664}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.3858751279426817, "acc_stderr,none": 0.01101540869248702}, "winogrande": {"alias": "winogrande", "acc,none": 0.56353591160221, "acc_stderr,none": 0.013938569465677024}} {"created_at": "2025-04-14T07:39:42.957389", "global_step": 30000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2781569965870307, "acc_stderr,none": 0.01309446991953879, "acc_norm,none": 0.3395904436860068, "acc_norm_stderr,none": 0.013839039762820169}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5669191919191919, "acc_stderr,none": 0.010167478013701785, "acc_norm,none": 0.539983164983165, "acc_norm_stderr,none": 0.010226927233491508}, "boolq": {"alias": "boolq", "acc,none": 0.6217125382262997, "acc_stderr,none": 0.008482001133930994}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20393120393120392, "acc_stderr,none": 0.011535521334313655}, "copa": {"alias": "copa", "acc,none": 0.63, "acc_stderr,none": 0.048523658709391}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.35680143397729536, "acc_stderr,none": 0.004780764443411322, "acc_norm,none": 0.44164509061939855, "acc_norm_stderr,none": 0.004955681533284345}, "mmlu": {"acc,none": 0.2591511180743484, "acc_stderr,none": 0.003694789992996243, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2450584484590861, "acc_stderr,none": 0.006269717959501759, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3412698412698413, "acc_stderr,none": 0.04240799327574924}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23030303030303031, "acc_stderr,none": 0.03287666758603488}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.02977177522814562}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.22784810126582278, "acc_stderr,none": 0.02730348459906941}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.19008264462809918, "acc_stderr,none": 0.035817969517092825}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.043300437496507416}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615767}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.023445826276545536}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2681564245810056, "acc_stderr,none": 0.014816119635317005}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.24115755627009647, "acc_stderr,none": 0.024296594034763426}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25, "acc_stderr,none": 0.02409347123262133}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23076923076923078, "acc_stderr,none": 
0.010760840584471697}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.03301405946987249}, "mmlu_other": {"acc,none": 0.2632764724814934, "acc_stderr,none": 0.007902234795521329, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24528301886792453, "acc_stderr,none": 0.026480357179895688}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2832369942196532, "acc_stderr,none": 0.03435568056047875}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621503}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.25112107623318386, "acc_stderr,none": 0.0291052208332246}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.041858325989283136}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.25213675213675213, "acc_stderr,none": 0.02844796547623101}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24904214559386972, "acc_stderr,none": 0.01546467616339599}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.02564686309713791}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2695035460992908, "acc_stderr,none": 0.026469036818590634}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3272058823529412, "acc_stderr,none": 0.028501452860396567}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.24096385542168675, "acc_stderr,none": 0.033293941190735296}, "mmlu_social_sciences": {"acc,none": 0.2622684432889178, "acc_stderr,none": 0.007922517592365319, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518752}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2676767676767677, "acc_stderr,none": 0.031544498882702866}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.27461139896373055, "acc_stderr,none": 0.03221024508041154}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.30512820512820515, "acc_stderr,none": 0.023346335293325884}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3067226890756303, "acc_stderr,none": 0.029953823891887037}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24770642201834864, "acc_stderr,none": 0.018508143602547808}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.31297709923664124, "acc_stderr,none": 0.04066962905677698}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2630718954248366, "acc_stderr,none": 0.017812676542320653}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.041723430387053825}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24897959183673468, "acc_stderr,none": 0.027682979522960238}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.1890547263681592, "acc_stderr,none": 
0.027686913588013024}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_stem": {"acc,none": 0.2730732635585157, "acc_stderr,none": 0.007926269787888622, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.03785714465066653}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.28289473684210525, "acc_stderr,none": 0.03665349695640767}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.20833333333333334, "acc_stderr,none": 0.033961162058453336}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.04336432707993177}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.21, "acc_stderr,none": 0.04093601807403326}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.23829787234042554, "acc_stderr,none": 0.02785125297388976}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.32413793103448274, "acc_stderr,none": 0.03900432069185555}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.02306818884826112}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.27419354838709675, "acc_stderr,none": 0.025378139970885196}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.31527093596059114, "acc_stderr,none": 0.03269080871970186}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.027940457136228416}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.39072847682119205, "acc_stderr,none": 0.039837983066598075}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.27314814814814814, "acc_stderr,none": 0.030388051301678116}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03894641120044792}, "mmlu_pro": {"exact_match,custom-extract": 0.07322140957446809, "exact_match_stderr,custom-extract": 0.0023641043841093733, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.05299860529986053, "exact_match_stderr,custom-extract": 0.008372420234387909}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09632446134347275, "exact_match_stderr,custom-extract": 0.010510211344233732}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.038869257950530034, "exact_match_stderr,custom-extract": 0.005747289272768996}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.06097560975609756, "exact_match_stderr,custom-extract": 0.011831910023092125}, 
"mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.08056872037914692, "exact_match_stderr,custom-extract": 0.009374089227447273}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.05263157894736842, "exact_match_stderr,custom-extract": 0.007177033492822924}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.10635696821515893, "exact_match_stderr,custom-extract": 0.010785840230716593}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.09711286089238845, "exact_match_stderr,custom-extract": 0.015190193611399455}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.048138056312443236, "exact_match_stderr,custom-extract": 0.006454088474962139}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.049592894152479645, "exact_match_stderr,custom-extract": 0.005908778090269268}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10173160173160173, "exact_match_stderr,custom-extract": 0.009950161991784681}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13226452905811623, "exact_match_stderr,custom-extract": 0.015181011139551245}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07159353348729793, "exact_match_stderr,custom-extract": 0.0071559719718151445}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.10275689223057644, "exact_match_stderr,custom-extract": 0.010755519334168299}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.214, "acc_stderr,none": 0.01835979750238701, "acc_norm,none": 0.322, "acc_norm_stderr,none": 0.020916668330019882}, "piqa": {"alias": "piqa", "acc,none": 0.6550598476605005, "acc_stderr,none": 0.01109067010299315, "acc_norm,none": 0.6463547334058759, "acc_norm_stderr,none": 0.011154877708188689}, "race": {"alias": "race", "acc,none": 0.3320574162679426, "acc_stderr,none": 0.014575582129545918}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.38331627430910953, "acc_stderr,none": 0.011001673478784571}, "winogrande": {"alias": "winogrande", "acc,none": 0.5540647198105761, "acc_stderr,none": 0.013970093482330699}} {"created_at": "2025-04-14T07:47:04.494998", "global_step": 32000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2815699658703072, "acc_stderr,none": 0.013143376735009019, "acc_norm,none": 0.32337883959044367, "acc_norm_stderr,none": 0.013669421630012127}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5787037037037037, "acc_stderr,none": 0.010131882498193134, "acc_norm,none": 0.5517676767676768, "acc_norm_stderr,none": 0.010204645126856928}, "boolq": {"alias": "boolq", "acc,none": 0.5792048929663609, "acc_stderr,none": 0.008634635146574865}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20966420966420968, "acc_stderr,none": 0.011654350093704639}, "copa": {"alias": "copa", "acc,none": 0.65, "acc_stderr,none": 0.047937248544110196}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.35600477992431784, "acc_stderr,none": 0.004778380758851136, "acc_norm,none": 0.44383588926508666, "acc_norm_stderr,none": 0.004958201874334091}, "mmlu": {"acc,none": 0.23657598632673407, "acc_stderr,none": 0.003583773892648426, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23719447396386822, "acc_stderr,none": 0.0062017908825259755, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.03893259610604676}, 
"mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.031922715695483}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.03166009679399813}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.27848101265822783, "acc_stderr,none": 0.02917868230484256}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.17355371900826447, "acc_stderr,none": 0.0345727283691767}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.041331194402438376}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.20245398773006135, "acc_stderr,none": 0.031570650789119026}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23575418994413408, "acc_stderr,none": 0.014196375686290804}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2057877813504823, "acc_stderr,none": 0.02296133990676425}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2345679012345679, "acc_stderr,none": 0.02357688174400572}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.242503259452412, "acc_stderr,none": 0.01094657096634877}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.031267817146631786}, "mmlu_other": {"acc,none": 0.2452526552944963, "acc_stderr,none": 0.007712029365728494, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23773584905660378, "acc_stderr,none": 0.026199808807561904}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2023121387283237, "acc_stderr,none": 0.030631145539198823}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.04185832598928315}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.029480360549541194}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23116219667943805, "acc_stderr,none": 0.015075523238101102}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.02355083135199509}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.22340425531914893, "acc_stderr,none": 0.024847921358063962}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.28308823529411764, "acc_stderr,none": 0.02736586113151381}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.26506024096385544, "acc_stderr,none": 0.03436024037944967}, "mmlu_social_sciences": {"acc,none": 0.24536886577835554, "acc_stderr,none": 0.00775590219999502, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, 
"acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2878787878787879, "acc_stderr,none": 0.03225883512300993}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2694300518134715, "acc_stderr,none": 0.032018671228777947}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.258974358974359, "acc_stderr,none": 0.022211106810061675}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.226890756302521, "acc_stderr,none": 0.027205371538279483}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26055045871559634, "acc_stderr,none": 0.01881918203485007}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2900763358778626, "acc_stderr,none": 0.03980066246467765}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.238562091503268, "acc_stderr,none": 0.017242385828779593}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.15454545454545454, "acc_stderr,none": 0.03462262571262667}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20816326530612245, "acc_stderr,none": 0.025991117672813296}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22388059701492538, "acc_stderr,none": 0.029475250236017176}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_stem": {"acc,none": 0.2185220424992071, "acc_stderr,none": 0.007351355714143667, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.17, "acc_stderr,none": 0.0377525168068637}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.03455473702325435}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.20394736842105263, "acc_stderr,none": 0.032790004063100515}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617746}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2851063829787234, "acc_stderr,none": 0.029513196625539355}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21164021164021163, "acc_stderr,none": 0.02103733150526289}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2161290322580645, "acc_stderr,none": 0.023415293433568532}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.17733990147783252, "acc_stderr,none": 0.026874337276808342}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", 
"acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.02488211685765511}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.18543046357615894, "acc_stderr,none": 0.031732843842942865}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1527777777777778, "acc_stderr,none": 0.02453632602613422}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.29464285714285715, "acc_stderr,none": 0.0432704093257873}, "mmlu_pro": {"exact_match,custom-extract": 0.08319481382978723, "exact_match_stderr,custom-extract": 0.002511751770297542, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.05299860529986053, "exact_match_stderr,custom-extract": 0.008372420234387898}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.08618504435994931, "exact_match_stderr,custom-extract": 0.00999728278366823}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.08303886925795052, "exact_match_stderr,custom-extract": 0.008205118814321247}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.07560975609756097, "exact_match_stderr,custom-extract": 0.013072388347810092}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1018957345971564, "exact_match_stderr,custom-extract": 0.010419037340753801}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07017543859649122, "exact_match_stderr,custom-extract": 0.008210231372398007}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.10146699266503667, "exact_match_stderr,custom-extract": 0.010563756545064423}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12335958005249344, "exact_match_stderr,custom-extract": 0.016869623436798514}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.05267938237965486, "exact_match_stderr,custom-extract": 0.006735541083195283}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.056254626202812734, "exact_match_stderr,custom-extract": 0.00627104121743841}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.09415584415584416, "exact_match_stderr,custom-extract": 0.009612791742161186}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12625250501002003, "exact_match_stderr,custom-extract": 0.014883268009546953}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09853733641262509, "exact_match_stderr,custom-extract": 0.008272503032902292}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.09273182957393483, "exact_match_stderr,custom-extract": 0.010274320069747482}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.224, "acc_stderr,none": 0.018663994464710794, "acc_norm,none": 0.312, "acc_norm_stderr,none": 0.02074059653648808}, "piqa": {"alias": "piqa", "acc,none": 0.6414581066376496, "acc_stderr,none": 0.011189212572356357, "acc_norm,none": 0.6479869423286181, "acc_norm_stderr,none": 0.011143148953066093}, "race": {"alias": "race", "acc,none": 0.34258373205741627, "acc_stderr,none": 0.014687684737145162}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.394575230296827, "acc_stderr,none": 0.011059713589720794}, "winogrande": {"alias": 
"winogrande", "acc,none": 0.5588003157063931, "acc_stderr,none": 0.013954975072834724}} {"created_at": "2025-04-14T14:46:20.346041", "global_step": 34000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.26109215017064846, "acc_stderr,none": 0.012835523909473855, "acc_norm,none": 0.3003412969283277, "acc_norm_stderr,none": 0.013395909309957}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5488215488215489, "acc_stderr,none": 0.010210757101073473, "acc_norm,none": 0.5353535353535354, "acc_norm_stderr,none": 0.010234104543411431}, "boolq": {"alias": "boolq", "acc,none": 0.6107033639143731, "acc_stderr,none": 0.008528016290984547}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19656019656019655, "acc_stderr,none": 0.011377439773964009}, "copa": {"alias": "copa", "acc,none": 0.67, "acc_stderr,none": 0.04725815626252607}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3584943238398725, "acc_stderr,none": 0.0047857819793548665, "acc_norm,none": 0.44971121290579563, "acc_norm_stderr,none": 0.004964479324552526}, "mmlu": {"acc,none": 0.23814271471300386, "acc_stderr,none": 0.0035897171785443565, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24420828905419767, "acc_stderr,none": 0.00626055031157178, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03670066451047181}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.17575757575757575, "acc_stderr,none": 0.02972094300622445}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.031145570659486782}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.21487603305785125, "acc_stderr,none": 0.03749492448709699}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094633}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.20245398773006135, "acc_stderr,none": 0.03157065078911904}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0230836585869842}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24581005586592178, "acc_stderr,none": 0.014400296429225601}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480764}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24382716049382716, "acc_stderr,none": 0.02389187954195961}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2607561929595828, "acc_stderr,none": 0.011213471559602334}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.26900584795321636, "acc_stderr,none": 0.0340105262010409}, "mmlu_other": {"acc,none": 0.24750563244287094, "acc_stderr,none": 0.007729849735702887, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22264150943396227, "acc_stderr,none": 0.025604233470899098}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2254335260115607, "acc_stderr,none": 0.03186209851641144}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 
0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3004484304932735, "acc_stderr,none": 0.030769352008229136}, "mmlu_management": {"alias": " - management", "acc,none": 0.18446601941747573, "acc_stderr,none": 0.03840423627288276}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.02987257770889117}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.26053639846743293, "acc_stderr,none": 0.01569600856380708}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23202614379084968, "acc_stderr,none": 0.024170840879341026}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2695035460992908, "acc_stderr,none": 0.02646903681859063}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.17279411764705882, "acc_stderr,none": 0.02296606758558181}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.24096385542168675, "acc_stderr,none": 0.03329394119073531}, "mmlu_social_sciences": {"acc,none": 0.21806954826129346, "acc_stderr,none": 0.007437640288734209, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.04185774424022057}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.1919191919191919, "acc_stderr,none": 0.02805779167298902}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.17098445595854922, "acc_stderr,none": 0.02717121368316455}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.19230769230769232, "acc_stderr,none": 0.019982347208637303}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.2018348623853211, "acc_stderr,none": 0.01720857935778758}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306085}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24836601307189543, "acc_stderr,none": 0.017479487001364764}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.04172343038705383}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.17959183673469387, "acc_stderr,none": 0.024573293589585637}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.25870646766169153, "acc_stderr,none": 0.030965903123573023}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_stem": {"acc,none": 0.23945448778940692, "acc_stderr,none": 0.007589540273310397, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.03455473702325435}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.29605263157894735, "acc_stderr,none": 0.03715062154998905}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.03745554791462457}, 
"mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.19607843137254902, "acc_stderr,none": 0.03950581861179961}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.02924188386962882}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.02193587808118476}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.20967741935483872, "acc_stderr,none": 0.02315787934908352}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1724137931034483, "acc_stderr,none": 0.02657767218303658}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.027309140588230172}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.0347918557259966}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16203703703703703, "acc_stderr,none": 0.025130453652268455}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.042878587513404544}, "mmlu_pro": {"exact_match,custom-extract": 0.0694813829787234, "exact_match_stderr,custom-extract": 0.002304408831563505, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.044630404463040445, "exact_match_stderr,custom-extract": 0.007716930840229505}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10012674271229405, "exact_match_stderr,custom-extract": 0.010693074879962119}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.029151943462897525, "exact_match_stderr,custom-extract": 0.005002396212309428}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.06341463414634146, "exact_match_stderr,custom-extract": 0.012050547403328612}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.11137440758293839, "exact_match_stderr,custom-extract": 0.010835234740619492}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.047471620227038186, "exact_match_stderr,custom-extract": 0.006834681966463692}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.08801955990220049, "exact_match_stderr,custom-extract": 0.00991222907127109}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10236220472440945, "exact_match_stderr,custom-extract": 0.015549935163883116}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.04632152588555858, 
"exact_match_stderr,custom-extract": 0.006337180483262906}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.03997039230199852, "exact_match_stderr,custom-extract": 0.005331441333936072}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.09740259740259741, "exact_match_stderr,custom-extract": 0.009759587414564116}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11422845691382766, "exact_match_stderr,custom-extract": 0.014253888115016511}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.06312548113933796, "exact_match_stderr,custom-extract": 0.00675003023278425}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.10150375939849623, "exact_match_stderr,custom-extract": 0.010697198018805514}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.21, "acc_stderr,none": 0.018233620865305916, "acc_norm,none": 0.32, "acc_norm_stderr,none": 0.020882340488761808}, "piqa": {"alias": "piqa", "acc,none": 0.6430903155603918, "acc_stderr,none": 0.011177909079261196, "acc_norm,none": 0.6463547334058759, "acc_norm_stderr,none": 0.01115487770818867}, "race": {"alias": "race", "acc,none": 0.30526315789473685, "acc_stderr,none": 0.014252698955501592}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.39048106448311154, "acc_stderr,none": 0.01103932371486307}, "winogrande": {"alias": "winogrande", "acc,none": 0.5753749013417522, "acc_stderr,none": 0.013891893150264232}} {"created_at": "2025-04-14T14:59:54.398193", "global_step": 36000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.29266211604095566, "acc_stderr,none": 0.013295916103619411, "acc_norm,none": 0.3395904436860068, "acc_norm_stderr,none": 0.013839039762820169}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5972222222222222, "acc_stderr,none": 0.010063960494989163, "acc_norm,none": 0.5782828282828283, "acc_norm_stderr,none": 0.010133255284012316}, "boolq": {"alias": "boolq", "acc,none": 0.6321100917431193, "acc_stderr,none": 0.00843427659109304}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20638820638820637, "acc_stderr,none": 0.011586881879177831}, "copa": {"alias": "copa", "acc,none": 0.72, "acc_stderr,none": 0.045126085985421276}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3584943238398725, "acc_stderr,none": 0.004785781979354867, "acc_norm,none": 0.4493128858793069, "acc_norm_stderr,none": 0.004964075870120345}, "mmlu": {"acc,none": 0.25345392394245836, "acc_stderr,none": 0.0036685898013330527, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25313496280552605, "acc_stderr,none": 0.0063284068719425424, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1746031746031746, "acc_stderr,none": 0.03395490020856111}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.03192271569548299}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28921568627450983, "acc_stderr,none": 0.03182231867647553}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3206751054852321, "acc_stderr,none": 0.030381931949990407}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.03984979653302872}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.04077494709252628}, "mmlu_logical_fallacies": 
{"alias": " - logical_fallacies", "acc,none": 0.2883435582822086, "acc_stderr,none": 0.035590395316173425}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2398843930635838, "acc_stderr,none": 0.022989592543123563}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.17363344051446947, "acc_stderr,none": 0.02151405158597041}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24382716049382716, "acc_stderr,none": 0.023891879541959607}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27053455019556716, "acc_stderr,none": 0.011345996743539258}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.034462962170884265}, "mmlu_other": {"acc,none": 0.2413904087544255, "acc_stderr,none": 0.007677123472078534, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23018867924528302, "acc_stderr,none": 0.02590789712240817}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2254335260115607, "acc_stderr,none": 0.03186209851641144}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909281}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.26905829596412556, "acc_stderr,none": 0.02976377940687498}, "mmlu_management": {"alias": " - management", "acc,none": 0.18446601941747573, "acc_stderr,none": 0.03840423627288276}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2863247863247863, "acc_stderr,none": 0.029614323690456648}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2541507024265645, "acc_stderr,none": 0.015569254692045773}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023805186524888135}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24468085106382978, "acc_stderr,none": 0.025645553622266733}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19852941176470587, "acc_stderr,none": 0.024231013370541087}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21686746987951808, "acc_stderr,none": 0.03208284450356365}, "mmlu_social_sciences": {"acc,none": 0.25706857328566785, "acc_stderr,none": 0.007886546040906965, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518753}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2474747474747475, "acc_stderr,none": 0.030746300742124495}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2538860103626943, "acc_stderr,none": 0.03141024780565318}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.258974358974359, "acc_stderr,none": 0.02221110681006166}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.029344572500634342}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 
0.24770642201834864, "acc_stderr,none": 0.018508143602547825}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.03641297081313729}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.017952449196987866}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2818181818181818, "acc_stderr,none": 0.04309118709946459}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24489795918367346, "acc_stderr,none": 0.02752963744017492}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2835820895522388, "acc_stderr,none": 0.03187187537919797}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_stem": {"acc,none": 0.2622898826514431, "acc_stderr,none": 0.00782908290132899, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720683}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.25, "acc_stderr,none": 0.03523807393012047}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2916666666666667, "acc_stderr,none": 0.038009680605548574}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110196}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.042801058373643966}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3021276595744681, "acc_stderr,none": 0.030017554471880557}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.25517241379310346, "acc_stderr,none": 0.03632984052707841}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24867724867724866, "acc_stderr,none": 0.022261817692400192}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.23870967741935484, "acc_stderr,none": 0.024251071262208834}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.030108330718011625}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.21851851851851853, "acc_stderr,none": 0.025195752251823782}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.33774834437086093, "acc_stderr,none": 0.038615575462551684}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.028765111718046972}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.29464285714285715, "acc_stderr,none": 0.04327040932578729}, "mmlu_pro": {"exact_match,custom-extract": 0.07288896276595745, "exact_match_stderr,custom-extract": 
0.0023599536423174914, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.03486750348675035, "exact_match_stderr,custom-extract": 0.0068556302237254745}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.08365019011406843, "exact_match_stderr,custom-extract": 0.009862817667640083}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.030918727915194347, "exact_match_stderr,custom-extract": 0.005147064453055614}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.046341463414634146, "exact_match_stderr,custom-extract": 0.010394884507288395}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.0924170616113744, "exact_match_stderr,custom-extract": 0.009974828838692062}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.054695562435500514, "exact_match_stderr,custom-extract": 0.007308432091333981}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.09902200488997555, "exact_match_stderr,custom-extract": 0.010449894873209579}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10498687664041995, "exact_match_stderr,custom-extract": 0.015724991203554552}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.05722070844686648, "exact_match_stderr,custom-extract": 0.007003018762823531}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06217616580310881, "exact_match_stderr,custom-extract": 0.006572123520013909}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11255411255411256, "exact_match_stderr,custom-extract": 0.010402812578120448}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.10420841683366733, "exact_match_stderr,custom-extract": 0.013691159072055339}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07929176289453425, "exact_match_stderr,custom-extract": 0.007499593169839573}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.09273182957393483, "exact_match_stderr,custom-extract": 0.01027432006974746}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.22, "acc_stderr,none": 0.01854421137582033, "acc_norm,none": 0.314, "acc_norm_stderr,none": 0.020776701920308997}, "piqa": {"alias": "piqa", "acc,none": 0.6420021762785637, "acc_stderr,none": 0.0111854604166173, "acc_norm,none": 0.6420021762785637, "acc_norm_stderr,none": 0.0111854604166173}, "race": {"alias": "race", "acc,none": 0.3253588516746411, "acc_stderr,none": 0.014499982471636879}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.41453428863868985, "acc_stderr,none": 0.01114756056703673}, "winogrande": {"alias": "winogrande", "acc,none": 0.5477505919494869, "acc_stderr,none": 0.013988256216606028}}
{"created_at": "2025-04-14T15:02:50.917780", "global_step": 38000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2815699658703072, "acc_stderr,none": 0.01314337673500902, "acc_norm,none": 0.3122866894197952, "acc_norm_stderr,none": 0.013542598541688065}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5753367003367004, "acc_stderr,none": 0.01014265368748041, "acc_norm,none": 0.553030303030303, "acc_norm_stderr,none": 0.010201914927791676}, "boolq": {"alias": "boolq", "acc,none": 0.6440366972477064, "acc_stderr,none": 0.008374337517726586}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19246519246519248, 
"acc_stderr,none": 0.011286955409752646}, "copa": {"alias": "copa", "acc,none": 0.63, "acc_stderr,none": 0.048523658709391}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.35909181437960563, "acc_stderr,none": 0.004787537385153002, "acc_norm,none": 0.4471220872336188, "acc_norm_stderr,none": 0.004961799358836435}, "mmlu": {"acc,none": 0.25409485828229594, "acc_stderr,none": 0.0036705507019836174, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2629117959617428, "acc_stderr,none": 0.006417512477017649, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.0361960452412425}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.03374402644139406}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28921568627450983, "acc_stderr,none": 0.03182231867647553}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2616033755274262, "acc_stderr,none": 0.028609516716994934}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2892561983471074, "acc_stderr,none": 0.04139112727635462}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.0413311944024384}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2883435582822086, "acc_stderr,none": 0.03559039531617342}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2398843930635838, "acc_stderr,none": 0.022989592543123563}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24134078212290502, "acc_stderr,none": 0.01431099954796146}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.29260450160771706, "acc_stderr,none": 0.02583989833487798}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3148148148148148, "acc_stderr,none": 0.025842248700902168}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2646675358539765, "acc_stderr,none": 0.011267332992845535}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.25146198830409355, "acc_stderr,none": 0.033275044238468436}, "mmlu_other": {"acc,none": 0.24943675571290633, "acc_stderr,none": 0.0077417362315356465, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2037735849056604, "acc_stderr,none": 0.02479078450177541}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.030952890217749884}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.20179372197309417, "acc_stderr,none": 0.026936111912802256}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.03916667762822583}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.23504273504273504, "acc_stderr,none": 0.027778835904935448}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2835249042145594, "acc_stderr,none": 0.016117318166832283}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2549019607843137, 
"acc_stderr,none": 0.02495418432487991}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2730496453900709, "acc_stderr,none": 0.026577860943307854}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20220588235294118, "acc_stderr,none": 0.024398192986654924}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3072289156626506, "acc_stderr,none": 0.035915667978246635}, "mmlu_social_sciences": {"acc,none": 0.24114397140071497, "acc_stderr,none": 0.007709949713479085, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518752}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.031911782267135445}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22279792746113988, "acc_stderr,none": 0.030031147977641545}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.21025641025641026, "acc_stderr,none": 0.020660597485026924}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.27310924369747897, "acc_stderr,none": 0.02894200404099817}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23486238532110093, "acc_stderr,none": 0.01817511051034357}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.037683359597287434}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.018120224251484587}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.04122066502878285}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2, "acc_stderr,none": 0.02560737598657916}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22885572139303484, "acc_stderr,none": 0.029705284056772436}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_stem": {"acc,none": 0.25816682524579765, "acc_stderr,none": 0.007791198405117198, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3111111111111111, "acc_stderr,none": 0.03999262876617722}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.03690677986137282}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.03586879280080341}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.047609522856952344}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.04440521906179328}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2553191489361702, "acc_stderr,none": 
0.028504856470514196}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.03695183311650232}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25132275132275134, "acc_stderr,none": 0.022340482339643898}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2806451612903226, "acc_stderr,none": 0.0255606047210229}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2561576354679803, "acc_stderr,none": 0.0307127300709826}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.02659393910184407}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2119205298013245, "acc_stderr,none": 0.03336767086567978}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.026491914727355157}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755805}, "mmlu_pro": {"exact_match,custom-extract": 0.06632313829787234, "exact_match_stderr,custom-extract": 0.0022572432772628185, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.04881450488145049, "exact_match_stderr,custom-extract": 0.008052867301788173}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.08871989860583017, "exact_match_stderr,custom-extract": 0.010129158179627853}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.026501766784452298, "exact_match_stderr,custom-extract": 0.004776103123350212}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.08292682926829269, "exact_match_stderr,custom-extract": 0.013636027558244172}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.07819905213270142, "exact_match_stderr,custom-extract": 0.009247099534246461}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.030959752321981424, "exact_match_stderr,custom-extract": 0.005567137755376452}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.10757946210268948, "exact_match_stderr,custom-extract": 0.010840228436417479}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.06299212598425197, "exact_match_stderr,custom-extract": 0.012463010328276552}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07720254314259764, "exact_match_stderr,custom-extract": 0.008047716247195611}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0458919319022946, "exact_match_stderr,custom-extract": 0.0056950831612678}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.08333333333333333, "exact_match_stderr,custom-extract": 0.009097336226802715}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11422845691382766, "exact_match_stderr,custom-extract": 0.014253888115016508}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.05003849114703618, "exact_match_stderr,custom-extract": 0.006051565814435417}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 
0.09398496240601503, "exact_match_stderr,custom-extract": 0.010336362416948094}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.202, "acc_stderr,none": 0.01797326003128824, "acc_norm,none": 0.32, "acc_norm_stderr,none": 0.020882340488761805}, "piqa": {"alias": "piqa", "acc,none": 0.6447225244831338, "acc_stderr,none": 0.011166464269528293, "acc_norm,none": 0.6360174102285092, "acc_norm_stderr,none": 0.011225875703487171}, "race": {"alias": "race", "acc,none": 0.3196172248803828, "acc_stderr,none": 0.014432497601303535}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.38945752302968273, "acc_stderr,none": 0.011034098821726027}, "winogrande": {"alias": "winogrande", "acc,none": 0.5643251775848461, "acc_stderr,none": 0.013935709739615713}}
{"created_at": "2025-04-14T15:07:26.228940", "global_step": 40000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2858361774744027, "acc_stderr,none": 0.013203196088537365, "acc_norm,none": 0.3122866894197952, "acc_norm_stderr,none": 0.013542598541688064}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5681818181818182, "acc_stderr,none": 0.010163945352271723, "acc_norm,none": 0.5382996632996633, "acc_norm_stderr,none": 0.010229639820610517}, "boolq": {"alias": "boolq", "acc,none": 0.6324159021406728, "acc_stderr,none": 0.008432809471149862}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19901719901719903, "acc_stderr,none": 0.011430809442838382}, "copa": {"alias": "copa", "acc,none": 0.7, "acc_stderr,none": 0.046056618647183814}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3625771758613822, "acc_stderr,none": 0.004797616754372309, "acc_norm,none": 0.44991037641904, "acc_norm_stderr,none": 0.004964679845918433}, "mmlu": {"acc,none": 0.24569149693775816, "acc_stderr,none": 0.0036274681552599178, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2405951115834219, "acc_stderr,none": 0.006234126251020665, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03670066451047181}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.0340150671524904}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.030190282453501933}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.26582278481012656, "acc_stderr,none": 0.028756799629658332}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2892561983471074, "acc_stderr,none": 0.041391127276354626}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.044531975073749834}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.03351953879521269}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23410404624277456, "acc_stderr,none": 0.022797110278071128}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19935691318327975, "acc_stderr,none": 0.022691033780549656}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24691358024691357, "acc_stderr,none": 0.023993501709042117}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23859191655801826, "acc_stderr,none": 0.010885929742002223}, 
"mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.21637426900584794, "acc_stderr,none": 0.031581495393387345}, "mmlu_other": {"acc,none": 0.2658513035082073, "acc_stderr,none": 0.007912368367651202, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2528301886792453, "acc_stderr,none": 0.02674989977124123}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.19653179190751446, "acc_stderr,none": 0.030299574664788147}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621505}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3542600896860987, "acc_stderr,none": 0.03210062154134986}, "mmlu_management": {"alias": " - management", "acc,none": 0.24271844660194175, "acc_stderr,none": 0.04245022486384495}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2564102564102564, "acc_stderr,none": 0.028605953702004257}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542126}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2771392081736909, "acc_stderr,none": 0.016005636294122425}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2581699346405229, "acc_stderr,none": 0.025058503316958157}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.026992199173064356}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19852941176470587, "acc_stderr,none": 0.02423101337054108}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.27710843373493976, "acc_stderr,none": 0.034843315926805875}, "mmlu_social_sciences": {"acc,none": 0.23561910952226195, "acc_stderr,none": 0.007629623278004991, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.040493392977481404}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25757575757575757, "acc_stderr,none": 0.031156269519646836}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20725388601036268, "acc_stderr,none": 0.029252823291803624}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2153846153846154, "acc_stderr,none": 0.020843034557462878}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.027553614467863807}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.27155963302752295, "acc_stderr,none": 0.01906909836319144}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596919}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.016906615927288142}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.39090909090909093, "acc_stderr,none": 0.04673752333670237}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.1673469387755102, "acc_stderr,none": 0.02389714476891452}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.20398009950248755, "acc_stderr,none": 0.028493176245326084}, 
"mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_stem": {"acc,none": 0.2432603869330796, "acc_stderr,none": 0.00762988053085359, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2740740740740741, "acc_stderr,none": 0.03853254836552003}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.18421052631578946, "acc_stderr,none": 0.0315469804508223}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2013888888888889, "acc_stderr,none": 0.0335364746971384}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.03873958714149353}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3404255319148936, "acc_stderr,none": 0.030976692998534443}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131184}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2566137566137566, "acc_stderr,none": 0.022494510767503154}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25161290322580643, "acc_stderr,none": 0.024685979286239963}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.24630541871921183, "acc_stderr,none": 0.03031509928561773}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.27037037037037037, "acc_stderr,none": 0.027080372815145658}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.19205298013245034, "acc_stderr,none": 0.032162984205936156}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.19907407407407407, "acc_stderr,none": 0.027232298462690242}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.26785714285714285, "acc_stderr,none": 0.04203277291467764}, "mmlu_pro": {"exact_match,custom-extract": 0.08219747340425532, "exact_match_stderr,custom-extract": 0.002491320255671332, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.08926080892608089, "exact_match_stderr,custom-extract": 0.010655428293467986}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09632446134347275, "exact_match_stderr,custom-extract": 0.010510211344233737}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.03445229681978799, "exact_match_stderr,custom-extract": 0.005423312815552813}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.09024390243902439, "exact_match_stderr,custom-extract": 0.014168039768581504}, "mmlu_pro_economics": 
{"alias": " - economics", "exact_match,custom-extract": 0.11255924170616113, "exact_match_stderr,custom-extract": 0.01088545226217949}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.04643962848297214, "exact_match_stderr,custom-extract": 0.006763644717533301}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.11124694376528117, "exact_match_stderr,custom-extract": 0.011000782283753618}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.08923884514435695, "exact_match_stderr,custom-extract": 0.01462471535140518}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10354223433242507, "exact_match_stderr,custom-extract": 0.009186019023092292}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.046632124352331605, "exact_match_stderr,custom-extract": 0.005738600250720587}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11904761904761904, "exact_match_stderr,custom-extract": 0.010659472740112152}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.10020040080160321, "exact_match_stderr,custom-extract": 0.013455286690416086}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.06389530408006158, "exact_match_stderr,custom-extract": 0.006788273571883455}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.11027568922305764, "exact_match_stderr,custom-extract": 0.011095284901172607}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.212, "acc_stderr,none": 0.018297037004013885, "acc_norm,none": 0.308, "acc_norm_stderr,none": 0.0206670329874661}, "piqa": {"alias": "piqa", "acc,none": 0.6463547334058759, "acc_stderr,none": 0.01115487770818868, "acc_norm,none": 0.6496191512513602, "acc_norm_stderr,none": 0.011131277554681733}, "race": {"alias": "race", "acc,none": 0.32344497607655504, "acc_stderr,none": 0.014477764809417717}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.38178096212896623, "acc_stderr,none": 0.010993277727292054}, "winogrande": {"alias": "winogrande", "acc,none": 0.5619573796369376, "acc_stderr,none": 0.013944181296470804}}
{"created_at": "2025-04-14T17:03:20.262280", "global_step": 42000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2935153583617747, "acc_stderr,none": 0.013307250444941124, "acc_norm,none": 0.3438566552901024, "acc_norm_stderr,none": 0.013880644570156208}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5875420875420876, "acc_stderr,none": 0.010101305447864771, "acc_norm,none": 0.5652356902356902, "acc_norm_stderr,none": 0.010172083670402789}, "boolq": {"alias": "boolq", "acc,none": 0.5168195718654435, "acc_stderr,none": 0.008740105658763937}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20147420147420148, "acc_stderr,none": 0.011483500195202903}, "copa": {"alias": "copa", "acc,none": 0.69, "acc_stderr,none": 0.04648231987117316}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.360884285998805, "acc_stderr,none": 0.004792755235823527, "acc_norm,none": 0.4534953196574388, "acc_norm_stderr,none": 0.004968151878211054}, "mmlu": {"acc,none": 0.23956701324597635, "acc_stderr,none": 0.0035998405192511763, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2412327311370882, "acc_stderr,none": 0.006238197119825095, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.038522733649243156}, "mmlu_high_school_european_history": 
{"alias": " - high_school_european_history", "acc,none": 0.22424242424242424, "acc_stderr,none": 0.032568666616811015}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.030778554678693264}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2320675105485232, "acc_stderr,none": 0.027479744550808514}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.19834710743801653, "acc_stderr,none": 0.03640118271990945}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.043300437496507437}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2331288343558282, "acc_stderr,none": 0.03322015795776741}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2398843930635838, "acc_stderr,none": 0.022989592543123563}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2346368715083799, "acc_stderr,none": 0.014173044098303679}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2090032154340836, "acc_stderr,none": 0.02309314039837422}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2191358024691358, "acc_stderr,none": 0.023016705640262185}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2503259452411995, "acc_stderr,none": 0.011064151027165424}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.035650796707083106}, "mmlu_other": {"acc,none": 0.24010299324106857, "acc_stderr,none": 0.007653430000998079, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252606}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22641509433962265, "acc_stderr,none": 0.025757559893106748}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.1791907514450867, "acc_stderr,none": 0.02924251305906329}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.14, "acc_stderr,none": 0.034873508801977704}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.26905829596412556, "acc_stderr,none": 0.029763779406874975}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.04058042015646035}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2692307692307692, "acc_stderr,none": 0.029058588303748845}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24904214559386972, "acc_stderr,none": 0.015464676163395964}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23202614379084968, "acc_stderr,none": 0.024170840879341016}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25886524822695034, "acc_stderr,none": 0.026129572527180848}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19852941176470587, "acc_stderr,none": 0.024231013370541083}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25301204819277107, "acc_stderr,none": 0.03384429155233134}, "mmlu_social_sciences": {"acc,none": 0.2333441663958401, "acc_stderr,none": 0.007628898053733627, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518754}, 
"mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.03173071239071724}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22279792746113988, "acc_stderr,none": 0.03003114797764154}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2128205128205128, "acc_stderr,none": 0.020752423722128006}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.02755361446786381}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.21651376146788992, "acc_stderr,none": 0.017658710594443145}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.26717557251908397, "acc_stderr,none": 0.038808483010823944}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2581699346405229, "acc_stderr,none": 0.01770453165325008}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.20909090909090908, "acc_stderr,none": 0.03895091015724138}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20408163265306123, "acc_stderr,none": 0.025801283475090506}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2537313432835821, "acc_stderr,none": 0.03076944496729602}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_stem": {"acc,none": 0.24262607040913417, "acc_stderr,none": 0.00762004840310525, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.0335567721631314}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.32894736842105265, "acc_stderr,none": 0.038234289699266046}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.03586879280080341}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421296}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.043898699568087785}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.1574468085106383, "acc_stderr,none": 0.023809905196619706}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.21379310344827587, "acc_stderr,none": 0.03416520447747549}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24867724867724866, "acc_stderr,none": 0.022261817692400175}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25806451612903225, "acc_stderr,none": 0.024892469172462843}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2019704433497537, "acc_stderr,none": 0.02824735012218027}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.32, 
"acc_stderr,none": 0.046882617226215034}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.024882116857655106}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.0347918557259966}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.030058202704309846}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.26785714285714285, "acc_stderr,none": 0.04203277291467763}, "mmlu_pro": {"exact_match,custom-extract": 0.0890126329787234, "exact_match_stderr,custom-extract": 0.0025904387246720433, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.06415620641562064, "exact_match_stderr,custom-extract": 0.009157238153838341}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10392902408111533, "exact_match_stderr,custom-extract": 0.010871175856870044}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06713780918727916, "exact_match_stderr,custom-extract": 0.007441509249865644}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.07073170731707316, "exact_match_stderr,custom-extract": 0.012676984988696252}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.11137440758293839, "exact_match_stderr,custom-extract": 0.010835234740619482}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08152734778121776, "exact_match_stderr,custom-extract": 0.00879522781860575}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1271393643031785, "exact_match_stderr,custom-extract": 0.011654709248697758}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.09448818897637795, "exact_match_stderr,custom-extract": 0.0150052772401423}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08446866485013624, "exact_match_stderr,custom-extract": 0.008384710625926236}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06069578090303479, "exact_match_stderr,custom-extract": 0.006498535623285846}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11688311688311688, "exact_match_stderr,custom-extract": 0.010575091539720229}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11422845691382766, "exact_match_stderr,custom-extract": 0.014253888115016499}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.0800615858352579, "exact_match_stderr,custom-extract": 0.0075327599014246675}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.10150375939849623, "exact_match_stderr,custom-extract": 0.010697198018805526}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.23, "acc_stderr,none": 0.01883905039112315, "acc_norm,none": 0.332, "acc_norm_stderr,none": 0.021081766571222852}, "piqa": {"alias": "piqa", "acc,none": 0.6425462459194777, "acc_stderr,none": 0.01118169259086765, "acc_norm,none": 0.6452665941240479, "acc_norm_stderr,none": 0.0111626178542803}, "race": {"alias": "race", "acc,none": 0.3435406698564593, "acc_stderr,none": 0.014697475413671399}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.39662231320368474, "acc_stderr,none": 0.011069603691860443}, "winogrande": {"alias": "winogrande", "acc,none": 
0.5737963693764798, "acc_stderr,none": 0.013898585965412338}} {"created_at": "2025-04-14T20:57:30.842568", "global_step": 46000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2977815699658703, "acc_stderr,none": 0.013363080107244489, "acc_norm,none": 0.30716723549488056, "acc_norm_stderr,none": 0.013481034054980945}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6014309764309764, "acc_stderr,none": 0.010046455400477947, "acc_norm,none": 0.5765993265993266, "acc_norm_stderr,none": 0.010138671005289052}, "boolq": {"alias": "boolq", "acc,none": 0.636697247706422, "acc_stderr,none": 0.008411885836787166}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20147420147420148, "acc_stderr,none": 0.011483500195202905}, "copa": {"alias": "copa", "acc,none": 0.66, "acc_stderr,none": 0.04760952285695238}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3641704839673372, "acc_stderr,none": 0.004802133511654227, "acc_norm,none": 0.4609639514041028, "acc_norm_stderr,none": 0.004974551179483934}, "mmlu": {"acc,none": 0.25964962256088875, "acc_stderr,none": 0.0036927818010244894, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24314558979808715, "acc_stderr,none": 0.006257558524476579, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23015873015873015, "acc_stderr,none": 0.03764950879790606}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.0347769116216366}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.21518987341772153, "acc_stderr,none": 0.026750826994676163}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04065578140908705}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.03755265865037183}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25766871165644173, "acc_stderr,none": 0.03436150827846917}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2254335260115607, "acc_stderr,none": 0.022497230190967547}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24022346368715083, "acc_stderr,none": 0.01428834380392532}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.26688102893890675, "acc_stderr,none": 0.025122637608816646}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2654320987654321, "acc_stderr,none": 0.02456922360046085}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23859191655801826, "acc_stderr,none": 0.010885929742002209}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.03377310252209196}, "mmlu_other": {"acc,none": 0.2648857418731896, "acc_stderr,none": 0.007880138441408328, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22641509433962265, "acc_stderr,none": 0.025757559893106744}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2254335260115607, "acc_stderr,none": 0.03186209851641144}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 
0.046882617226215034}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.16591928251121077, "acc_stderr,none": 0.02496755319654713}, "mmlu_management": {"alias": " - management", "acc,none": 0.2621359223300971, "acc_stderr,none": 0.04354631077260595}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3034188034188034, "acc_stderr,none": 0.030118210106942662}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24648786717752236, "acc_stderr,none": 0.01541130876968693}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3006535947712418, "acc_stderr,none": 0.02625605383571896}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2801418439716312, "acc_stderr,none": 0.026789172351140235}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3786764705882353, "acc_stderr,none": 0.029465133639776125}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.24096385542168675, "acc_stderr,none": 0.03329394119073532}, "mmlu_social_sciences": {"acc,none": 0.28339291517712056, "acc_stderr,none": 0.008110553747491133, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.19298245614035087, "acc_stderr,none": 0.037124548537213684}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2828282828282828, "acc_stderr,none": 0.0320877955878675}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.32124352331606215, "acc_stderr,none": 0.033699508685490674}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.34102564102564104, "acc_stderr,none": 0.024035489676335065}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31512605042016806, "acc_stderr,none": 0.030176808288974337}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.27155963302752295, "acc_stderr,none": 0.01906909836319145}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.33587786259541985, "acc_stderr,none": 0.041423137719966634}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.26143790849673204, "acc_stderr,none": 0.01777694715752803}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072775}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.23673469387755103, "acc_stderr,none": 0.027212835884073163}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.30845771144278605, "acc_stderr,none": 0.03265819588512697}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_stem": {"acc,none": 0.2559467174119886, "acc_stderr,none": 0.007767440299606333, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.03785714465066653}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.037827289808654685}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2847222222222222, "acc_stderr,none": 0.037738099906869355}, 
"mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.038612291966536955}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.18, "acc_stderr,none": 0.038612291966536955}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617749}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.20851063829787234, "acc_stderr,none": 0.026556982117838725}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.31724137931034485, "acc_stderr,none": 0.03878352372138622}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02256989707491841}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2709677419354839, "acc_stderr,none": 0.02528441611490016}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2660098522167488, "acc_stderr,none": 0.03108982600293752}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712163}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.03603038545360384}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.25, "acc_stderr,none": 0.029531221160930918}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.17857142857142858, "acc_stderr,none": 0.03635209121577806}, "mmlu_pro": {"exact_match,custom-extract": 0.08435837765957446, "exact_match_stderr,custom-extract": 0.0025282094808369666, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.07531380753138076, "exact_match_stderr,custom-extract": 0.009862294734961754}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.08618504435994931, "exact_match_stderr,custom-extract": 0.009997282783668225}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.054770318021201414, "exact_match_stderr,custom-extract": 0.006765657432918792}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.08780487804878048, "exact_match_stderr,custom-extract": 0.013993989404782777}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.10545023696682465, "exact_match_stderr,custom-extract": 0.010578211479944621}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.06191950464396285, "exact_match_stderr,custom-extract": 0.0077463320824784476}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.11246943765281174, "exact_match_stderr,custom-extract": 0.011053451044082589}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12598425196850394, "exact_match_stderr,custom-extract": 0.017022602638569518}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08991825613079019, 
"exact_match_stderr,custom-extract": 0.008625172638334896}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.05921539600296077, "exact_match_stderr,custom-extract": 0.0064238521314546075}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.09956709956709957, "exact_match_stderr,custom-extract": 0.009855592879057077}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.10020040080160321, "exact_match_stderr,custom-extract": 0.013455286690416086}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07467282525019246, "exact_match_stderr,custom-extract": 0.0072961138747172005}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.11027568922305764, "exact_match_stderr,custom-extract": 0.011095284901172611}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.218, "acc_stderr,none": 0.018483378223178845, "acc_norm,none": 0.322, "acc_norm_stderr,none": 0.020916668330019882}, "piqa": {"alias": "piqa", "acc,none": 0.6490750816104461, "acc_stderr,none": 0.011135250564776792, "acc_norm,none": 0.6403699673558215, "acc_norm_stderr,none": 0.011196669936752598}, "race": {"alias": "race", "acc,none": 0.32727272727272727, "acc_stderr,none": 0.014521924541567923}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.3976458546571136, "acc_stderr,none": 0.01107447277414865}, "winogrande": {"alias": "winogrande", "acc,none": 0.5540647198105761, "acc_stderr,none": 0.013970093482330706}} {"created_at": "2025-04-14T22:46:46.566150", "global_step": 48000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2738907849829352, "acc_stderr,none": 0.013032004972989503, "acc_norm,none": 0.310580204778157, "acc_norm_stderr,none": 0.013522292098053052}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5976430976430976, "acc_stderr,none": 0.010062244711011529, "acc_norm,none": 0.5664983164983165, "acc_norm_stderr,none": 0.010168640625454107}, "boolq": {"alias": "boolq", "acc,none": 0.6507645259938838, "acc_stderr,none": 0.008338033790721216}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20475020475020475, "acc_stderr,none": 0.01155271447787666}, "copa": {"alias": "copa", "acc,none": 0.68, "acc_stderr,none": 0.046882617226215034}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.36675960963951404, "acc_stderr,none": 0.004809352075008935, "acc_norm,none": 0.4683330013941446, "acc_norm_stderr,none": 0.004979763862134985}, "mmlu": {"acc,none": 0.23116365190143853, "acc_stderr,none": 0.0035529903911609715, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24420828905419767, "acc_stderr,none": 0.006263451212909046, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.03852273364924315}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.22424242424242424, "acc_stderr,none": 0.03256866661681102}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23039215686274508, "acc_stderr,none": 0.029554292605695053}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.28270042194092826, "acc_stderr,none": 0.029312814153955924}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04065578140908705}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04330043749650743}, 
"mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.03226219377286774}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2514450867052023, "acc_stderr,none": 0.023357365785874037}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.01442229220480885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.18971061093247588, "acc_stderr,none": 0.022268196258783218}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.22839506172839505, "acc_stderr,none": 0.023358211840626267}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2457627118644068, "acc_stderr,none": 0.010996156635142692}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.03508771929824565}, "mmlu_other": {"acc,none": 0.24267782426778242, "acc_stderr,none": 0.00767486338669963, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.31, "acc_stderr,none": 0.046482319871173156}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.02528839450289137}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2023121387283237, "acc_stderr,none": 0.03063114553919882}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3094170403587444, "acc_stderr,none": 0.031024411740572192}, "mmlu_management": {"alias": " - management", "acc,none": 0.22330097087378642, "acc_stderr,none": 0.04123553189891431}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.21711366538952745, "acc_stderr,none": 0.014743125394823288}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.238562091503268, "acc_stderr,none": 0.024404394928087873}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2730496453900709, "acc_stderr,none": 0.026577860943307854}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19852941176470587, "acc_stderr,none": 0.024231013370541087}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2891566265060241, "acc_stderr,none": 0.03529486801511115}, "mmlu_social_sciences": {"acc,none": 0.21904452388690282, "acc_stderr,none": 0.007454951438157205, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748141}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.17676767676767677, "acc_stderr,none": 0.027178752639044915}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20512820512820512, "acc_stderr,none": 0.020473233173551972}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.226890756302521, "acc_stderr,none": 0.027205371538279483}, "mmlu_high_school_psychology": {"alias": " - 
high_school_psychology", "acc,none": 0.2, "acc_stderr,none": 0.017149858514250944}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.01755581809132227}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.1836734693877551, "acc_stderr,none": 0.024789071332007636}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916707}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_stem": {"acc,none": 0.21217887725975262, "acc_stderr,none": 0.0072721812852178545, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.033556772163131396}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19736842105263158, "acc_stderr,none": 0.03238981601699397}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.03586879280080341}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.16, "acc_stderr,none": 0.03684529491774708}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.038739587141493524}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342347}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.25517241379310346, "acc_stderr,none": 0.03632984052707842}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.22486772486772486, "acc_stderr,none": 0.02150209607822914}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.18064516129032257, "acc_stderr,none": 0.02188617856717254}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.16748768472906403, "acc_stderr,none": 0.026273086047535414}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.02504044387700069}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.19205298013245034, "acc_stderr,none": 0.032162984205936156}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.14814814814814814, "acc_stderr,none": 0.024227629273728363}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25892857142857145, "acc_stderr,none": 0.04157751539865629}, "mmlu_pro": {"exact_match,custom-extract": 0.08976063829787234, 
"exact_match_stderr,custom-extract": 0.002598222735002503, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.09483960948396095, "exact_match_stderr,custom-extract": 0.010949672704790078}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09378960709759189, "exact_match_stderr,custom-extract": 0.01038553249235858}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.038869257950530034, "exact_match_stderr,custom-extract": 0.005747289272769011}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.08048780487804878, "exact_match_stderr,custom-extract": 0.013451853667809174}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.10781990521327015, "exact_match_stderr,custom-extract": 0.01068223063355725}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09184726522187822, "exact_match_stderr,custom-extract": 0.009282712153833536}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1136919315403423, "exact_match_stderr,custom-extract": 0.011105705318793043}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12335958005249344, "exact_match_stderr,custom-extract": 0.016869623436798518}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.06721162579473206, "exact_match_stderr,custom-extract": 0.007549486626300604}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06587712805329386, "exact_match_stderr,custom-extract": 0.00675153382039657}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11363636363636363, "exact_match_stderr,custom-extract": 0.010446330904021004}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11422845691382766, "exact_match_stderr,custom-extract": 0.014253888115016523}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09545804464973057, "exact_match_stderr,custom-extract": 0.008156113834567345}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.11528822055137844, "exact_match_stderr,custom-extract": 0.011312646389022083}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.23, "acc_stderr,none": 0.018839050391123137, "acc_norm,none": 0.338, "acc_norm_stderr,none": 0.02117566569520941}, "piqa": {"alias": "piqa", "acc,none": 0.64689880304679, "acc_stderr,none": 0.011150983944502311, "acc_norm,none": 0.6436343852013058, "acc_norm_stderr,none": 0.011174109865864732}, "race": {"alias": "race", "acc,none": 0.32248803827751193, "acc_stderr,none": 0.01446655223501507}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.41760491299897645, "acc_stderr,none": 0.011159391894922484}, "winogrande": {"alias": "winogrande", "acc,none": 0.5595895816890292, "acc_stderr,none": 0.01395233031191561}} {"created_at": "2025-04-15T00:40:03.520203", "global_step": 50000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2687713310580205, "acc_stderr,none": 0.01295506596371068, "acc_norm,none": 0.3037542662116041, "acc_norm_stderr,none": 0.01343890918477876}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5896464646464646, "acc_stderr,none": 0.010093531255765464, "acc_norm,none": 0.5458754208754208, "acc_norm_stderr,none": 0.010216507710244096}, "boolq": {"alias": "boolq", "acc,none": 0.627217125382263, "acc_stderr,none": 0.00845725586791469}, "commonsense_qa": {"alias": "commonsense_qa", 
"acc,none": 0.18427518427518427, "acc_stderr,none": 0.011100060579347624}, "copa": {"alias": "copa", "acc,none": 0.61, "acc_stderr,none": 0.04902071300001975}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3639713204540928, "acc_stderr,none": 0.004801572028920793, "acc_norm,none": 0.46046604262099183, "acc_norm_stderr,none": 0.004974159561342697}, "mmlu": {"acc,none": 0.2621421449935907, "acc_stderr,none": 0.0037072932353908856, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25802337938363445, "acc_stderr,none": 0.00637984258363529, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.29365079365079366, "acc_stderr,none": 0.040735243221471255}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.32727272727272727, "acc_stderr,none": 0.036639749943912434}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.030778554678693285}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.24472573839662448, "acc_stderr,none": 0.027985699387036416}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.32231404958677684, "acc_stderr,none": 0.042664163633521664}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.032910995786157686}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2658959537572254, "acc_stderr,none": 0.023786203255508287}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574877}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2733118971061093, "acc_stderr,none": 0.02531176597542612}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25308641975308643, "acc_stderr,none": 0.024191808600713002}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2588005215123859, "acc_stderr,none": 0.011186109046564604}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23391812865497075, "acc_stderr,none": 0.03246721765117826}, "mmlu_other": {"acc,none": 0.24975860959124557, "acc_stderr,none": 0.007743850680256737, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2490566037735849, "acc_stderr,none": 0.026616482980501704}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.27167630057803466, "acc_stderr,none": 0.033917503223216586}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.038612291966536955}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.16143497757847533, "acc_stderr,none": 0.02469395789912846}, "mmlu_management": {"alias": " - management", "acc,none": 0.3883495145631068, "acc_stderr,none": 0.0482572933735639}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.25213675213675213, "acc_stderr,none": 0.02844796547623102}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24393358876117496, "acc_stderr,none": 0.015357212665829475}, "mmlu_nutrition": {"alias": " - nutrition", 
"acc,none": 0.2973856209150327, "acc_stderr,none": 0.026173908506718576}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24468085106382978, "acc_stderr,none": 0.02564555362226673}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.25, "acc_stderr,none": 0.026303648393696036}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.22289156626506024, "acc_stderr,none": 0.032400048255946876}, "mmlu_social_sciences": {"acc,none": 0.26811829704257395, "acc_stderr,none": 0.007982991950952655, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.04266339443159394}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.31313131313131315, "acc_stderr,none": 0.033042050878136525}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20207253886010362, "acc_stderr,none": 0.02897908979429673}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2846153846153846, "acc_stderr,none": 0.022878322799706287}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24369747899159663, "acc_stderr,none": 0.027886828078380572}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.29541284403669726, "acc_stderr,none": 0.019560619182976}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2900763358778626, "acc_stderr,none": 0.03980066246467765}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24183006535947713, "acc_stderr,none": 0.017322789207784326}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.040139645540727714}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2693877551020408, "acc_stderr,none": 0.02840125202902294}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2935323383084577, "acc_stderr,none": 0.03220024104534205}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_stem": {"acc,none": 0.2746590548683793, "acc_stderr,none": 0.007940129787141537, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.037857144650666544}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3223684210526316, "acc_stderr,none": 0.03803510248351585}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.03745554791462457}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768081}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.047609522856952365}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.04389869956808777}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 
0.225531914893617, "acc_stderr,none": 0.027321078417387533}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.036951833116502325}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.29365079365079366, "acc_stderr,none": 0.023456037383982033}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3032258064516129, "acc_stderr,none": 0.026148685930671746}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.0317852971064275}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036624}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712166}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.03479185572599662}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3425925925925926, "acc_stderr,none": 0.032365852526021574}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.22321428571428573, "acc_stderr,none": 0.039523019677025116}, "mmlu_pro": {"exact_match,custom-extract": 0.08768284574468085, "exact_match_stderr,custom-extract": 0.0025733616791018743, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.07670850767085077, "exact_match_stderr,custom-extract": 0.009945684370631551}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10266159695817491, "exact_match_stderr,custom-extract": 0.01081232338068659}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06802120141342756, "exact_match_stderr,custom-extract": 0.00748675916800453}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.0975609756097561, "exact_match_stderr,custom-extract": 0.014671865834334245}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1066350710900474, "exact_match_stderr,custom-extract": 0.010630426613851857}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.0825593395252838, "exact_match_stderr,custom-extract": 0.00884574505399991}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12102689486552567, "exact_match_stderr,custom-extract": 0.011410842488489005}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11286089238845144, "exact_match_stderr,custom-extract": 0.016232140903461437}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.05994550408719346, "exact_match_stderr,custom-extract": 0.007157452608481624}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.05995558845299778, "exact_match_stderr,custom-extract": 0.006461333188096238}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11038961038961038, "exact_match_stderr,custom-extract": 0.010314856083401155}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.10821643286573146, "exact_match_stderr,custom-extract": 0.013920719044718378}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08314087759815242, "exact_match_stderr,custom-extract": 0.007663395880276797},
"mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.09899749373433583, "exact_match_stderr,custom-extract": 0.010579032194343373}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.208, "acc_stderr,none": 0.01816954222122988, "acc_norm,none": 0.318, "acc_norm_stderr,none": 0.02084757162081401}, "piqa": {"alias": "piqa", "acc,none": 0.6409140369967355, "acc_stderr,none": 0.011192949073844117, "acc_norm,none": 0.6490750816104461, "acc_norm_stderr,none": 0.011135250564776787}, "race": {"alias": "race", "acc,none": 0.3339712918660287, "acc_stderr,none": 0.014596569299709724}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.39201637666325484, "acc_stderr,none": 0.011047065375041742}, "winogrande": {"alias": "winogrande", "acc,none": 0.5698500394632992, "acc_stderr,none": 0.013914685094716698}} {"created_at": "2025-04-15T05:57:03.480936", "global_step": 52000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2790102389078498, "acc_stderr,none": 0.013106784883601343, "acc_norm,none": 0.3054607508532423, "acc_norm_stderr,none": 0.013460080478002501}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5791245791245792, "acc_stderr,none": 0.010130502164066333, "acc_norm,none": 0.5437710437710438, "acc_norm_stderr,none": 0.01022039438372202}, "boolq": {"alias": "boolq", "acc,none": 0.5293577981651376, "acc_stderr,none": 0.008729967580199222}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20065520065520065, "acc_stderr,none": 0.011466011466011559}, "copa": {"alias": "copa", "acc,none": 0.67, "acc_stderr,none": 0.04725815626252607}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.36675960963951404, "acc_stderr,none": 0.004809352075008941, "acc_norm,none": 0.46116311491734713, "acc_norm_stderr,none": 0.004974706428434287}, "mmlu": {"acc,none": 0.24334140435835352, "acc_stderr,none": 0.0036189240588498065, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2452709883103082, "acc_stderr,none": 0.006273391341368885, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.03932537680392871}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.0340150671524904}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.20098039215686275, "acc_stderr,none": 0.028125972265654362}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.02904133351059802}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070418}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.32407407407407407, "acc_stderr,none": 0.04524596007030048}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.03226219377286774}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.22832369942196531, "acc_stderr,none": 0.02259870380432162}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2347266881028939, "acc_stderr,none": 0.024071805887677045}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.02378858355165854}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24771838331160365, "acc_stderr,none": 
0.011025499291443737}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.03508771929824565}, "mmlu_other": {"acc,none": 0.2539427100096556, "acc_stderr,none": 0.007809711096303094, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2490566037735849, "acc_stderr,none": 0.02661648298050171}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.1791907514450867, "acc_stderr,none": 0.029242513059063287}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2914798206278027, "acc_stderr,none": 0.0305002831765459}, "mmlu_management": {"alias": " - management", "acc,none": 0.2815533980582524, "acc_stderr,none": 0.04453254836326467}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.029480360549541194}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.25287356321839083, "acc_stderr,none": 0.015543377313719681}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.02463004897982479}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2695035460992908, "acc_stderr,none": 0.02646903681859063}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.02576725201085596}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.26506024096385544, "acc_stderr,none": 0.03436024037944967}, "mmlu_social_sciences": {"acc,none": 0.23724406889827754, "acc_stderr,none": 0.007666773382228974, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281336}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.23232323232323232, "acc_stderr,none": 0.030088629490217483}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.17616580310880828, "acc_stderr,none": 0.027493504244548047}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.21025641025641026, "acc_stderr,none": 0.020660597485026928}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.226890756302521, "acc_stderr,none": 0.027205371538279483}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26055045871559634, "acc_stderr,none": 0.018819182034850068}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2748091603053435, "acc_stderr,none": 0.03915345408847836}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2565359477124183, "acc_stderr,none": 0.01766784161237899}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.04069306319721377}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19183673469387755, "acc_stderr,none": 0.025206963154225395}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.26865671641791045, "acc_stderr,none": 0.03134328358208954}, 
"mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_stem": {"acc,none": 0.23596574690770694, "acc_stderr,none": 0.007544220711482571, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.16296296296296298, "acc_stderr,none": 0.031905414744828414}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.27631578947368424, "acc_stderr,none": 0.03639057569952924}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.04605661864718381}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.04220773659171453}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2723404255319149, "acc_stderr,none": 0.0291012906983867}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23544973544973544, "acc_stderr,none": 0.02185150982203172}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1935483870967742, "acc_stderr,none": 0.022475258525536057}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.18226600985221675, "acc_stderr,none": 0.02716334085964515}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.22592592592592592, "acc_stderr,none": 0.025497532639609546}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2119205298013245, "acc_stderr,none": 0.03336767086567977}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.17592592592592593, "acc_stderr,none": 0.02596742095825853}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.36607142857142855, "acc_stderr,none": 0.04572372358737431}, "mmlu_pro": {"exact_match,custom-extract": 0.09217087765957446, "exact_match_stderr,custom-extract": 0.0026280225156442585, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.08507670850767085, "exact_match_stderr,custom-extract": 0.010426562968267428}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09885931558935361, "exact_match_stderr,custom-extract": 0.01063266154407548}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.04328621908127209, "exact_match_stderr,custom-extract": 0.006051102910695472}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1024390243902439, "exact_match_stderr,custom-extract": 0.014993500684238464}, "mmlu_pro_economics": {"alias": " 
- economics", "exact_match,custom-extract": 0.10781990521327015, "exact_match_stderr,custom-extract": 0.010682230633557242}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07636738906088751, "exact_match_stderr,custom-extract": 0.00853622633668934}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12102689486552567, "exact_match_stderr,custom-extract": 0.011410842488489005}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12073490813648294, "exact_match_stderr,custom-extract": 0.016714159620683268}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07811080835603997, "exact_match_stderr,custom-extract": 0.008090932633847634}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07105847520355292, "exact_match_stderr,custom-extract": 0.006992544617386945}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11147186147186147, "exact_match_stderr,custom-extract": 0.010358988935082428}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1342685370741483, "exact_match_stderr,custom-extract": 0.015277913884522426}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08237105465742879, "exact_match_stderr,custom-extract": 0.007631036296293894}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.13784461152882205, "exact_match_stderr,custom-extract": 0.012211204647685938}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.218, "acc_stderr,none": 0.01848337822317885, "acc_norm,none": 0.332, "acc_norm_stderr,none": 0.021081766571222856}, "piqa": {"alias": "piqa", "acc,none": 0.6425462459194777, "acc_stderr,none": 0.01118169259086765, "acc_norm,none": 0.6496191512513602, "acc_norm_stderr,none": 0.01113127755468173}, "race": {"alias": "race", "acc,none": 0.3138755980861244, "acc_stderr,none": 0.014362497295239087}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.387410440122825, "acc_stderr,none": 0.011023495621289047}, "winogrande": {"alias": "winogrande", "acc,none": 0.5651144435674822, "acc_stderr,none": 0.013932814110418029}} {"created_at": "2025-04-15T06:00:52.862626", "global_step": 54000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.26621160409556316, "acc_stderr,none": 0.012915774781523207, "acc_norm,none": 0.3174061433447099, "acc_norm_stderr,none": 0.01360223908803817}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5521885521885522, "acc_stderr,none": 0.010203742451111534, "acc_norm,none": 0.5281986531986532, "acc_norm_stderr,none": 0.010243454104071788}, "boolq": {"alias": "boolq", "acc,none": 0.5837920489296636, "acc_stderr,none": 0.008621380519419275}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2031122031122031, "acc_stderr,none": 0.011518254793634086}, "copa": {"alias": "copa", "acc,none": 0.69, "acc_stderr,none": 0.04648231987117316}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.37343158733320053, "acc_stderr,none": 0.004827266662144026, "acc_norm,none": 0.46614220274845647, "acc_norm_stderr,none": 0.0049783281907755245}, "mmlu": {"acc,none": 0.23842757441959836, "acc_stderr,none": 0.0035925968460763357, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2450584484590861, "acc_stderr,none": 0.006268125235732799, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1984126984126984, "acc_stderr,none": 0.035670166752768656}, "mmlu_high_school_european_history": {"alias": " - 
high_school_european_history", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.033175059300091805}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.03019028245350194}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.23628691983122363, "acc_stderr,none": 0.027652153144159274}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.3140495867768595, "acc_stderr,none": 0.042369647530410184}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.04453197507374984}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.24539877300613497, "acc_stderr,none": 0.03380939813943354}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.25722543352601157, "acc_stderr,none": 0.023532925431044276}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480747}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21604938271604937, "acc_stderr,none": 0.022899162918445796}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25749674054758803, "acc_stderr,none": 0.011167706014904173}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.27485380116959063, "acc_stderr,none": 0.03424042924691584}, "mmlu_other": {"acc,none": 0.2481493401995494, "acc_stderr,none": 0.007729780415945686, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.31, "acc_stderr,none": 0.046482319871173156}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2188679245283019, "acc_stderr,none": 0.025447863825108625}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.21965317919075145, "acc_stderr,none": 0.031568093627031744}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.037601780060266196}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.31196581196581197, "acc_stderr,none": 0.030351527323344948}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2567049808429119, "acc_stderr,none": 0.015620480263064519}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.0239291555173513}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.23404255319148937, "acc_stderr,none": 0.025257861359432414}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.17647058823529413, "acc_stderr,none": 0.023157468308559328}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.22716932076698082, "acc_stderr,none": 0.007561928448483219, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518754}, 
"mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.029126522834586815}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20725388601036268, "acc_stderr,none": 0.02925282329180363}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2153846153846154, "acc_stderr,none": 0.020843034557462874}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.02665353159671548}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.21100917431192662, "acc_stderr,none": 0.017493922404112648}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.03768335959728743}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.017555818091322277}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.04122066502878284}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22448979591836735, "acc_stderr,none": 0.02671143055553842}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916707}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_stem": {"acc,none": 0.22993973993022518, "acc_stderr,none": 0.007484777763612005, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.22962962962962963, "acc_stderr,none": 0.03633384414073465}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.21710526315789475, "acc_stderr,none": 0.03355045304882924}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.12, "acc_stderr,none": 0.032659863237109066}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816508}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.04280105837364395}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.028809989854102963}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.25517241379310346, "acc_stderr,none": 0.03632984052707842}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.22486772486772486, "acc_stderr,none": 0.021502096078229147}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.19032258064516128, "acc_stderr,none": 0.02233170761182307}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2315270935960591, "acc_stderr,none": 0.029678333141444458}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", 
"acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23333333333333334, "acc_stderr,none": 0.02578787422095932}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.24503311258278146, "acc_stderr,none": 0.035118075718047245}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1527777777777778, "acc_stderr,none": 0.024536326026134203}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25, "acc_stderr,none": 0.04109974682633932}, "mmlu_pro": {"exact_match,custom-extract": 0.09109042553191489, "exact_match_stderr,custom-extract": 0.002617272398373614, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.058577405857740586, "exact_match_stderr,custom-extract": 0.008776085886807549}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10266159695817491, "exact_match_stderr,custom-extract": 0.010812323380686585}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.05742049469964664, "exact_match_stderr,custom-extract": 0.006917690995369925}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.09024390243902439, "exact_match_stderr,custom-extract": 0.014168039768581515}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.11255924170616113, "exact_match_stderr,custom-extract": 0.01088545226217949}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07120743034055728, "exact_match_stderr,custom-extract": 0.008265789561265736}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.11124694376528117, "exact_match_stderr,custom-extract": 0.01100078228375362}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11548556430446194, "exact_match_stderr,custom-extract": 0.01639549430578109}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07629427792915532, "exact_match_stderr,custom-extract": 0.008004172836965893}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07327905255366396, "exact_match_stderr,custom-extract": 0.007092470342788472}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11147186147186147, "exact_match_stderr,custom-extract": 0.010358988935082428}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11823647294589178, "exact_match_stderr,custom-extract": 0.014468953704661754}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10238645111624327, "exact_match_stderr,custom-extract": 0.008414505495298189}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.11779448621553884, "exact_match_stderr,custom-extract": 0.011418740524805808}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.23, "acc_stderr,none": 0.01883905039112314, "acc_norm,none": 0.324, "acc_norm_stderr,none": 0.020950557312477445}, "piqa": {"alias": "piqa", "acc,none": 0.6441784548422198, "acc_stderr,none": 0.011170294934656948, "acc_norm,none": 0.6436343852013058, "acc_norm_stderr,none": 0.011174109865864727}, "race": {"alias": "race", "acc,none": 0.3167464114832536, "acc_stderr,none": 0.014397814139910632}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.3889457523029683, "acc_stderr,none": 0.011031467212127695}, "winogrande": {"alias": "winogrande", 
"acc,none": 0.5722178374112076, "acc_stderr,none": 0.013905134013839944}} {"created_at": "2025-04-15T06:19:49.351467", "global_step": 56000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2815699658703072, "acc_stderr,none": 0.013143376735009017, "acc_norm,none": 0.3302047781569966, "acc_norm_stderr,none": 0.013743085603760427}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6153198653198653, "acc_stderr,none": 0.009983171707009004, "acc_norm,none": 0.5951178451178452, "acc_norm_stderr,none": 0.0100724239603957}, "boolq": {"alias": "boolq", "acc,none": 0.6162079510703364, "acc_stderr,none": 0.008505584729104971}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21048321048321048, "acc_stderr,none": 0.011671038436522906}, "copa": {"alias": "copa", "acc,none": 0.66, "acc_stderr,none": 0.04760952285695237}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3703445528779128, "acc_stderr,none": 0.004819100456867818, "acc_norm,none": 0.4676359290977893, "acc_norm_stderr,none": 0.00497931751543253}, "mmlu": {"acc,none": 0.26684233015239994, "acc_stderr,none": 0.003716011388944678, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2503719447396387, "acc_stderr,none": 0.006309701327096764, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.35714285714285715, "acc_stderr,none": 0.04285714285714281}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03225078108306289}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.029771775228145652}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.23628691983122363, "acc_stderr,none": 0.027652153144159267}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2644628099173554, "acc_stderr,none": 0.040261875275912046}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04330043749650742}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25766871165644173, "acc_stderr,none": 0.03436150827846917}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2514450867052023, "acc_stderr,none": 0.023357365785874037}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.27262569832402234, "acc_stderr,none": 0.01489339173524962}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.24758842443729903, "acc_stderr,none": 0.024513879973621967}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.023788583551658544}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24641460234680573, "acc_stderr,none": 0.011005971399927247}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.14619883040935672, "acc_stderr,none": 0.02709729011807082}, "mmlu_other": {"acc,none": 0.251689732861281, "acc_stderr,none": 0.007746620422296156, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.27547169811320754, "acc_stderr,none": 0.027495663683724064}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3352601156069364, "acc_stderr,none": 0.03599586301247077}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.22, 
"acc_stderr,none": 0.0416333199893227}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2062780269058296, "acc_stderr,none": 0.027157150479563824}, "mmlu_management": {"alias": " - management", "acc,none": 0.34951456310679613, "acc_stderr,none": 0.047211885060971716}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.19658119658119658, "acc_stderr,none": 0.026035386098951292}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.014866821664709597}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.025261691219729484}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24822695035460993, "acc_stderr,none": 0.025770015644290396}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.34558823529411764, "acc_stderr,none": 0.02888819310398864}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.16265060240963855, "acc_stderr,none": 0.02873023789261379}, "mmlu_social_sciences": {"acc,none": 0.2885927851803705, "acc_stderr,none": 0.008147540763676405, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.04266339443159394}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.35353535353535354, "acc_stderr,none": 0.03406086723547153}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.3160621761658031, "acc_stderr,none": 0.033553973696861736}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3, "acc_stderr,none": 0.023234581088428494}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31512605042016806, "acc_stderr,none": 0.030176808288974337}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3412844036697248, "acc_stderr,none": 0.020328612816592432}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.20610687022900764, "acc_stderr,none": 0.03547771004159464}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24836601307189543, "acc_stderr,none": 0.017479487001364764}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.041220665028782855}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2816326530612245, "acc_stderr,none": 0.028795185574291282}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22885572139303484, "acc_stderr,none": 0.02970528405677243}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_stem": {"acc,none": 0.2851252775134792, "acc_stderr,none": 0.007984063091487928, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.26973684210526316, "acc_stderr,none": 0.036117805602848975}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566017}, 
"mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.41, "acc_stderr,none": 0.049431107042371025}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.37254901960784315, "acc_stderr,none": 0.04810840148082636}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.028809989854102967}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776557}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3, "acc_stderr,none": 0.026069362295335134}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.26108374384236455, "acc_stderr,none": 0.030903796952114475}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.304635761589404, "acc_stderr,none": 0.037579499229433426}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.44907407407407407, "acc_stderr,none": 0.03392238405321617}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.16964285714285715, "acc_stderr,none": 0.0356236785009539}, "mmlu_pro": {"exact_match,custom-extract": 0.09483045212765957, "exact_match_stderr,custom-extract": 0.0026641752250014224, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.09344490934449093, "exact_match_stderr,custom-extract": 0.010877232530238947}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10266159695817491, "exact_match_stderr,custom-extract": 0.010812323380686578}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06978798586572438, "exact_match_stderr,custom-extract": 0.007576175072607233}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.07073170731707316, "exact_match_stderr,custom-extract": 0.012676984988696263}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.11966824644549763, "exact_match_stderr,custom-extract": 0.011178894558789454}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09494324045407637, "exact_match_stderr,custom-extract": 0.00942176471567824}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.0941320293398533, "exact_match_stderr,custom-extract": 0.010216217601013355}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10498687664041995, "exact_match_stderr,custom-extract": 0.015724991203554566}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.06902815622161672, 
"exact_match_stderr,custom-extract": 0.007643373236177118}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06365655070318282, "exact_match_stderr,custom-extract": 0.006644652222081488}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12987012987012986, "exact_match_stderr,custom-extract": 0.011064857512116033}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.14829659318637275, "exact_match_stderr,custom-extract": 0.015925574493977506}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09930715935334873, "exact_match_stderr,custom-extract": 0.00830120786199475}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.11278195488721804, "exact_match_stderr,custom-extract": 0.011204844440903066}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.26, "acc_stderr,none": 0.019635965529725512, "acc_norm,none": 0.346, "acc_norm_stderr,none": 0.021294951277234637}, "piqa": {"alias": "piqa", "acc,none": 0.6479869423286181, "acc_stderr,none": 0.011143148953066088, "acc_norm,none": 0.6588683351468988, "acc_norm_stderr,none": 0.011061289443962705}, "race": {"alias": "race", "acc,none": 0.32057416267942584, "acc_stderr,none": 0.014443918794282801}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.40583418628454454, "acc_stderr,none": 0.011111610832965833}, "winogrande": {"alias": "winogrande", "acc,none": 0.5674822415153907, "acc_stderr,none": 0.013923911578623832}} {"created_at": "2025-04-15T07:33:56.257302", "global_step": 58000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2901023890784983, "acc_stderr,none": 0.013261573677520766, "acc_norm,none": 0.3148464163822526, "acc_norm_stderr,none": 0.013572657703084948}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5782828282828283, "acc_stderr,none": 0.010133255284012325, "acc_norm,none": 0.5635521885521886, "acc_norm_stderr,none": 0.01017656998011104}, "boolq": {"alias": "boolq", "acc,none": 0.635474006116208, "acc_stderr,none": 0.008417937294741665}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.23423423423423423, "acc_stderr,none": 0.012125320943764539}, "copa": {"alias": "copa", "acc,none": 0.68, "acc_stderr,none": 0.046882617226215034}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3663612826130253, "acc_stderr,none": 0.004808251269682432, "acc_norm,none": 0.46634136626170086, "acc_norm_stderr,none": 0.004978462690966934}, "mmlu": {"acc,none": 0.24341261928500213, "acc_stderr,none": 0.003616572676331962, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.251009564293305, "acc_stderr,none": 0.006317677005272109, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.31746031746031744, "acc_stderr,none": 0.0416345303130286}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.03192271569548299}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.029771775228145638}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.27848101265822783, "acc_stderr,none": 0.02917868230484255}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2231404958677686, "acc_stderr,none": 0.03800754475228733}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": 
{"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.03226219377286774}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.264804469273743, "acc_stderr,none": 0.01475690648326066}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480764}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2345679012345679, "acc_stderr,none": 0.023576881744005726}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25488917861799215, "acc_stderr,none": 0.011130509812662967}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.03508771929824565}, "mmlu_other": {"acc,none": 0.2603797875764403, "acc_stderr,none": 0.007856418368382261, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2188679245283019, "acc_stderr,none": 0.025447863825108625}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2023121387283237, "acc_stderr,none": 0.03063114553919882}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.34080717488789236, "acc_stderr,none": 0.0318114974705536}, "mmlu_management": {"alias": " - management", "acc,none": 0.22330097087378642, "acc_stderr,none": 0.04123553189891431}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.02948036054954119}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2541507024265645, "acc_stderr,none": 0.015569254692045769}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24836601307189543, "acc_stderr,none": 0.02473998135511359}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2624113475177305, "acc_stderr,none": 0.026244920349843007}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20955882352941177, "acc_stderr,none": 0.024723110407677072}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.2300942476438089, "acc_stderr,none": 0.0075905704574155625, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.21717171717171718, "acc_stderr,none": 0.029376616484945633}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.02869787397186069}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.23076923076923078, "acc_stderr,none": 0.02136202772522272}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24789915966386555, "acc_stderr,none": 0.028047967224176892}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 
0.21834862385321102, "acc_stderr,none": 0.01771260052872272}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.29770992366412213, "acc_stderr,none": 0.04010358942462203}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24673202614379086, "acc_stderr,none": 0.017440820367402507}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072773}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19183673469387755, "acc_stderr,none": 0.025206963154225423}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.21890547263681592, "acc_stderr,none": 0.029239174636647}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_stem": {"acc,none": 0.22835394862036157, "acc_stderr,none": 0.007465201907025515, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653695}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.1925925925925926, "acc_stderr,none": 0.03406542058502653}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.20394736842105263, "acc_stderr,none": 0.03279000406310049}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252606}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28085106382978725, "acc_stderr,none": 0.02937917046412482}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2, "acc_stderr,none": 0.03333333333333329}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2328042328042328, "acc_stderr,none": 0.02176596167215453}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.22258064516129034, "acc_stderr,none": 0.023664216671642518}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.15763546798029557, "acc_stderr,none": 0.025639014131172404}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23333333333333334, "acc_stderr,none": 0.025787874220959326}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2185430463576159, "acc_stderr,none": 0.033742355504256936}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.19444444444444445, "acc_stderr,none": 0.02699145450203673}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.04432804055291517}, "mmlu_pro": {"exact_match,custom-extract": 0.08086768617021277, "exact_match_stderr,custom-extract": 0.002479836787998264, 
"alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.08507670850767085, "exact_match_stderr,custom-extract": 0.010426562968267427}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09505703422053231, "exact_match_stderr,custom-extract": 0.010448155452886217}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.03445229681978799, "exact_match_stderr,custom-extract": 0.005423312815552823}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.0951219512195122, "exact_match_stderr,custom-extract": 0.014506870947377817}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.10071090047393365, "exact_match_stderr,custom-extract": 0.010365114807941408}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.054695562435500514, "exact_match_stderr,custom-extract": 0.007308432091333975}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.09535452322738386, "exact_match_stderr,custom-extract": 0.010275402181758791}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.08923884514435695, "exact_match_stderr,custom-extract": 0.01462471535140517}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07175295186194369, "exact_match_stderr,custom-extract": 0.007781356843650025}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07994078460399703, "exact_match_stderr,custom-extract": 0.007381170014696002}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10064935064935066, "exact_match_stderr,custom-extract": 0.00990305439291376}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.10020040080160321, "exact_match_stderr,custom-extract": 0.013455286690416082}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.0731331793687452, "exact_match_stderr,custom-extract": 0.007226509015118026}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.10526315789473684, "exact_match_stderr,custom-extract": 0.010870679630989288}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.212, "acc_stderr,none": 0.018297037004013885, "acc_norm,none": 0.328, "acc_norm_stderr,none": 0.021017027165175492}, "piqa": {"alias": "piqa", "acc,none": 0.6545157780195865, "acc_stderr,none": 0.01109480289361776, "acc_norm,none": 0.6572361262241567, "acc_norm_stderr,none": 0.011073978007039315}, "race": {"alias": "race", "acc,none": 0.3129186602870813, "acc_stderr,none": 0.014350583456012766}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.40583418628454454, "acc_stderr,none": 0.011111610832965836}, "winogrande": {"alias": "winogrande", "acc,none": 0.5674822415153907, "acc_stderr,none": 0.013923911578623835}} {"created_at": "2025-04-15T15:24:31.082971", "global_step": 60000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.26621160409556316, "acc_stderr,none": 0.012915774781523212, "acc_norm,none": 0.30119453924914674, "acc_norm_stderr,none": 0.013406741767847626}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5534511784511784, "acc_stderr,none": 0.010200990076245321, "acc_norm,none": 0.5231481481481481, "acc_norm_stderr,none": 0.010248782484554478}, "boolq": {"alias": "boolq", "acc,none": 0.5874617737003058, "acc_stderr,none": 0.008610223886822886}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20475020475020475, "acc_stderr,none": 
0.011552714477876674}, "copa": {"alias": "copa", "acc,none": 0.64, "acc_stderr,none": 0.048241815132442176}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.37203744274048994, "acc_stderr,none": 0.004823604775015905, "acc_norm,none": 0.4675363473411671, "acc_norm_stderr,none": 0.004979252954977318}, "mmlu": {"acc,none": 0.24148981626548924, "acc_stderr,none": 0.0036090021253768964, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24293304994686504, "acc_stderr,none": 0.006252883671429119, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.03809523809523812}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.20606060606060606, "acc_stderr,none": 0.031584153240477086}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.030778554678693247}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2489451476793249, "acc_stderr,none": 0.028146970599422644}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.32231404958677684, "acc_stderr,none": 0.04266416363352168}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04557239513497752}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.03226219377286774}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23410404624277456, "acc_stderr,none": 0.02279711027807113}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24022346368715083, "acc_stderr,none": 0.014288343803925307}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2282958199356913, "acc_stderr,none": 0.023839303311398215}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2345679012345679, "acc_stderr,none": 0.02357688174400572}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24315514993481094, "acc_stderr,none": 0.01095655665441735}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23976608187134502, "acc_stderr,none": 0.03274485211946956}, "mmlu_other": {"acc,none": 0.25587383327969104, "acc_stderr,none": 0.00781609705108404, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2490566037735849, "acc_stderr,none": 0.02661648298050171}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.21965317919075145, "acc_stderr,none": 0.031568093627031744}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3273542600896861, "acc_stderr,none": 0.03149384670994131}, "mmlu_management": {"alias": " - management", "acc,none": 0.3106796116504854, "acc_stderr,none": 0.0458212416016155}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.23504273504273504, "acc_stderr,none": 0.02777883590493543}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.26947637292464877, "acc_stderr,none": 0.015866243073215068}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22875816993464052, "acc_stderr,none": 
0.024051029739912258}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24468085106382978, "acc_stderr,none": 0.025645553622266733}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1875, "acc_stderr,none": 0.023709788253811766}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2469879518072289, "acc_stderr,none": 0.03357351982064536}, "mmlu_social_sciences": {"acc,none": 0.2382190445238869, "acc_stderr,none": 0.007676381360457073, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.03835153954399419}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25757575757575757, "acc_stderr,none": 0.03115626951964684}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20725388601036268, "acc_stderr,none": 0.029252823291803624}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2358974358974359, "acc_stderr,none": 0.021525965407408726}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.22268907563025211, "acc_stderr,none": 0.02702543349888238}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26055045871559634, "acc_stderr,none": 0.01881918203485007}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306085}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.017740899509177795}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2818181818181818, "acc_stderr,none": 0.04309118709946458}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.17142857142857143, "acc_stderr,none": 0.02412746346265014}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.19900497512437812, "acc_stderr,none": 0.02823136509275841}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_stem": {"acc,none": 0.22835394862036157, "acc_stderr,none": 0.007471692051557583, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.03355677216313142}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19078947368421054, "acc_stderr,none": 0.031975658210325}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.043364327079931736}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.24680851063829787, "acc_stderr,none": 0.0281854413012341}, 
"mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.036001056927277716}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.022019080012217893}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.23225806451612904, "acc_stderr,none": 0.024022256130308235}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.22660098522167488, "acc_stderr,none": 0.02945486383529296}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036845}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.19205298013245034, "acc_stderr,none": 0.032162984205936156}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1574074074074074, "acc_stderr,none": 0.024837173518242387}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.08726728723404255, "exact_match_stderr,custom-extract": 0.0025645353210073138, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.11157601115760112, "exact_match_stderr,custom-extract": 0.011766276311081415}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10139416983523447, "exact_match_stderr,custom-extract": 0.01075295922902333}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.05653710247349823, "exact_match_stderr,custom-extract": 0.0068674876015198645}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.05609756097560976, "exact_match_stderr,custom-extract": 0.01137820855397697}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.11255924170616113, "exact_match_stderr,custom-extract": 0.01088545226217949}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.06398348813209494, "exact_match_stderr,custom-extract": 0.007865711850285468}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1136919315403423, "exact_match_stderr,custom-extract": 0.011105705318793027}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.09186351706036745, "exact_match_stderr,custom-extract": 0.014816829983934033}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.06630336058128973, "exact_match_stderr,custom-extract": 0.0075019527734704085}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06661732050333087, "exact_match_stderr,custom-extract": 0.006786667382246598}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11580086580086581, "exact_match_stderr,custom-extract": 0.010532466716077556}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.14428857715430862, "exact_match_stderr,custom-extract": 0.015745808625512836}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07621247113163972, "exact_match_stderr,custom-extract": 0.007364812980869267}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.09649122807017543, 
"exact_match_stderr,custom-extract": 0.010458777957100894}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.218, "acc_stderr,none": 0.01848337822317885, "acc_norm,none": 0.334, "acc_norm_stderr,none": 0.02111349234774373}, "piqa": {"alias": "piqa", "acc,none": 0.6539717083786725, "acc_stderr,none": 0.011098919626957382, "acc_norm,none": 0.6550598476605005, "acc_norm_stderr,none": 0.011090670102993147}, "race": {"alias": "race", "acc,none": 0.3167464114832536, "acc_stderr,none": 0.014397814139910632}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.394575230296827, "acc_stderr,none": 0.011059713589720794}, "winogrande": {"alias": "winogrande", "acc,none": 0.5761641673243884, "acc_stderr,none": 0.013888492389944515}} {"created_at": "2025-04-15T17:47:21.794463", "global_step": 64000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2781569965870307, "acc_stderr,none": 0.013094469919538802, "acc_norm,none": 0.31569965870307165, "acc_norm_stderr,none": 0.013582571095815293}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6035353535353535, "acc_stderr,none": 0.010037412763064527, "acc_norm,none": 0.5694444444444444, "acc_norm_stderr,none": 0.010160345396860077}, "boolq": {"alias": "boolq", "acc,none": 0.5574923547400612, "acc_stderr,none": 0.008687051315181372}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.17936117936117937, "acc_stderr,none": 0.010983995312829129}, "copa": {"alias": "copa", "acc,none": 0.71, "acc_stderr,none": 0.04560480215720684}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.37502489543915557, "acc_stderr,none": 0.004831399218500239, "acc_norm,none": 0.47759410476000796, "acc_norm_stderr,none": 0.00498476891232693}, "mmlu": {"acc,none": 0.2335849594074918, "acc_stderr,none": 0.0035642023482474034, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24803400637619555, "acc_stderr,none": 0.0062895149846376, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30158730158730157, "acc_stderr,none": 0.041049472699033945}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.03192271569548301}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.03077855467869327}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.28270042194092826, "acc_stderr,none": 0.02931281415395592}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.03984979653302871}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.03226219377286774}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24566473988439305, "acc_stderr,none": 0.02317629820399201}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23910614525139665, "acc_stderr,none": 0.01426555419233115}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1832797427652733, "acc_stderr,none": 0.02197419884826582}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.02240967454730419}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26401564537157757, "acc_stderr,none": 0.011258435537723818}, "mmlu_world_religions": {"alias": " - world_religions", 
"acc,none": 0.3157894736842105, "acc_stderr,none": 0.03565079670708311}, "mmlu_other": {"acc,none": 0.24396523978113938, "acc_stderr,none": 0.00769181247759513, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.31, "acc_stderr,none": 0.046482319871173156}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21132075471698114, "acc_stderr,none": 0.025125766484827845}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.21965317919075145, "acc_stderr,none": 0.031568093627031744}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.14, "acc_stderr,none": 0.034873508801977704}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3094170403587444, "acc_stderr,none": 0.031024411740572192}, "mmlu_management": {"alias": " - management", "acc,none": 0.18446601941747573, "acc_stderr,none": 0.03840423627288276}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.029480360549541194}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24010217113665389, "acc_stderr,none": 0.015274685213734195}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023805186524888142}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24468085106382978, "acc_stderr,none": 0.02564555362226673}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.23161764705882354, "acc_stderr,none": 0.025626533803777562}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.2216444588885278, "acc_stderr,none": 0.007484283806251852, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748141}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.1717171717171717, "acc_stderr,none": 0.026869716187429917}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21761658031088082, "acc_stderr,none": 0.02977866303775295}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2205128205128205, "acc_stderr,none": 0.02102067268082791}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1981651376146789, "acc_stderr,none": 0.01709057380421788}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306086}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.017630827375148383}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.19090909090909092, "acc_stderr,none": 0.03764425585984926}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.18775510204081633, "acc_stderr,none": 0.025000256039546212}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916714}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 
0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_stem": {"acc,none": 0.2134475103076435, "acc_stderr,none": 0.007284773335886069, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.0335567721631314}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19078947368421054, "acc_stderr,none": 0.031975658210325}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342347}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.1931216931216931, "acc_stderr,none": 0.02033053816003564}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1935483870967742, "acc_stderr,none": 0.02247525852553606}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.15763546798029557, "acc_stderr,none": 0.025639014131172404}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.02504044387700069}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.19205298013245034, "acc_stderr,none": 0.032162984205936156}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1527777777777778, "acc_stderr,none": 0.02453632602613422}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "mmlu_pro": {"exact_match,custom-extract": 0.09059175531914894, "exact_match_stderr,custom-extract": 0.0026047242690335442, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.11576011157601115, "exact_match_stderr,custom-extract": 0.01195660847542174}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10519645120405577, "exact_match_stderr,custom-extract": 0.010929524923270307}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.05035335689045936, "exact_match_stderr,custom-extract": 0.006502254002193102}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.10975609756097561, "exact_match_stderr,custom-extract": 0.015456358358757421}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.10781990521327015, 
"exact_match_stderr,custom-extract": 0.010682230633557235}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07533539731682147, "exact_match_stderr,custom-extract": 0.008483088136190937}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.11124694376528117, "exact_match_stderr,custom-extract": 0.011000782283753608}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11023622047244094, "exact_match_stderr,custom-extract": 0.016065998434778163}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07266121707538602, "exact_match_stderr,custom-extract": 0.007826619182374837}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.037009622501850484, "exact_match_stderr,custom-extract": 0.0051380865687762}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10822510822510822, "exact_match_stderr,custom-extract": 0.010225646711914642}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13226452905811623, "exact_match_stderr,custom-extract": 0.015181011139551245}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09699769053117784, "exact_match_stderr,custom-extract": 0.00821462573306637}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12907268170426064, "exact_match_stderr,custom-extract": 0.011876239922954071}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.212, "acc_stderr,none": 0.01829703700401389, "acc_norm,none": 0.298, "acc_norm_stderr,none": 0.02047511809298896}, "piqa": {"alias": "piqa", "acc,none": 0.6577801958650707, "acc_stderr,none": 0.011069764658685453, "acc_norm,none": 0.6409140369967355, "acc_norm_stderr,none": 0.011192949073844115}, "race": {"alias": "race", "acc,none": 0.3167464114832536, "acc_stderr,none": 0.01439781413991062}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4022517911975435, "acc_stderr,none": 0.011095758951308076}, "winogrande": {"alias": "winogrande", "acc,none": 0.5611681136543015, "acc_stderr,none": 0.013946933444507032}} {"created_at": "2025-04-15T18:30:26.271282", "global_step": 70000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.29692832764505117, "acc_stderr,none": 0.01335202597672522, "acc_norm,none": 0.33276450511945393, "acc_norm_stderr,none": 0.013769863046192304}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6039562289562289, "acc_stderr,none": 0.010035580962097947, "acc_norm,none": 0.5736531986531986, "acc_norm_stderr,none": 0.010147858603835137}, "boolq": {"alias": "boolq", "acc,none": 0.6235474006116208, "acc_stderr,none": 0.008473882279194588}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20802620802620803, "acc_stderr,none": 0.01162075957565237}, "copa": {"alias": "copa", "acc,none": 0.69, "acc_stderr,none": 0.04648231987117316}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.37502489543915557, "acc_stderr,none": 0.004831399218500234, "acc_norm,none": 0.4753037243576977, "acc_norm_stderr,none": 0.004983691099110915}, "mmlu": {"acc,none": 0.26463466742629255, "acc_stderr,none": 0.0037105817360857355, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2454835281615303, "acc_stderr,none": 0.006273960565733806, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.31746031746031744, "acc_stderr,none": 0.04163453031302859}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2727272727272727, 
"acc_stderr,none": 0.03477691162163659}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.030587591351604246}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.24050632911392406, "acc_stderr,none": 0.027820781981149685}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.1652892561983471, "acc_stderr,none": 0.03390780612972776}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.044531975073749834}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2331288343558282, "acc_stderr,none": 0.03322015795776741}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2446927374301676, "acc_stderr,none": 0.01437816988409842}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2347266881028939, "acc_stderr,none": 0.024071805887677048}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21604938271604937, "acc_stderr,none": 0.022899162918445796}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25097783572359844, "acc_stderr,none": 0.011073730299187241}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.21637426900584794, "acc_stderr,none": 0.03158149539338733}, "mmlu_other": {"acc,none": 0.25458641776633406, "acc_stderr,none": 0.007770355427505489, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2981132075471698, "acc_stderr,none": 0.028152837942493868}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2947976878612717, "acc_stderr,none": 0.03476599607516478}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.1031390134529148, "acc_stderr,none": 0.020412564289839272}, "mmlu_management": {"alias": " - management", "acc,none": 0.32038834951456313, "acc_stderr,none": 0.0462028408228004}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.21794871794871795, "acc_stderr,none": 0.02704685763071669}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2503192848020434, "acc_stderr,none": 0.015491088951494583}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.02582916327275747}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24822695035460993, "acc_stderr,none": 0.025770015644290406}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.33088235294117646, "acc_stderr,none": 0.028582709753898435}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21084337349397592, "acc_stderr,none": 0.03175554786629919}, "mmlu_social_sciences": {"acc,none": 0.28436789080272995, "acc_stderr,none": 0.008106887896598563, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.041424397194893596}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 
0.3434343434343434, "acc_stderr,none": 0.03383201223244442}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.32642487046632124, "acc_stderr,none": 0.033840286211432945}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3487179487179487, "acc_stderr,none": 0.02416278028401772}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.030388353551886845}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.29174311926605506, "acc_stderr,none": 0.019489300968876532}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.0384487613978527}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.016639319350313268}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.04350271442923243}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2530612244897959, "acc_stderr,none": 0.027833023871399683}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.26865671641791045, "acc_stderr,none": 0.03134328358208954}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_stem": {"acc,none": 0.28385664446558834, "acc_stderr,none": 0.008013803072132026, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165065}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2740740740740741, "acc_stderr,none": 0.03853254836552003}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.27631578947368424, "acc_stderr,none": 0.03639057569952924}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2986111111111111, "acc_stderr,none": 0.038270523579507554}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.39, "acc_stderr,none": 0.04902071300001975}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.046550104113196156}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421255}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.20425531914893616, "acc_stderr,none": 0.026355158413349428}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2827586206896552, "acc_stderr,none": 0.037528339580033376}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24338624338624337, "acc_stderr,none": 0.022101128787415426}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.31290322580645163, "acc_stderr,none": 0.026377567028645854}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2955665024630542, "acc_stderr,none": 0.032104944337514575}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_high_school_mathematics": 
{"alias": " - high_school_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.0279404571362284}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31125827814569534, "acc_stderr,none": 0.03780445850526732}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3425925925925926, "acc_stderr,none": 0.032365852526021574}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.19642857142857142, "acc_stderr,none": 0.03770970049347019}, "mmlu_pro": {"exact_match,custom-extract": 0.08826462765957446, "exact_match_stderr,custom-extract": 0.0025731336095999877, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.07531380753138076, "exact_match_stderr,custom-extract": 0.009862294734961759}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09885931558935361, "exact_match_stderr,custom-extract": 0.010632661544075495}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.038869257950530034, "exact_match_stderr,custom-extract": 0.005747289272769019}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.08780487804878048, "exact_match_stderr,custom-extract": 0.013993989404782777}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.11255924170616113, "exact_match_stderr,custom-extract": 0.01088545226217949}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.05675954592363261, "exact_match_stderr,custom-extract": 0.007436917896532645}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.13691931540342298, "exact_match_stderr,custom-extract": 0.012026715288380393}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11023622047244094, "exact_match_stderr,custom-extract": 0.016065998434778187}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08446866485013624, "exact_match_stderr,custom-extract": 0.0083847106259262}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0540340488527017, "exact_match_stderr,custom-extract": 0.00615325089732228}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1277056277056277, "exact_match_stderr,custom-extract": 0.010985901551102243}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11222444889779559, "exact_match_stderr,custom-extract": 0.014144273960803923}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08314087759815242, "exact_match_stderr,custom-extract": 0.007663395880276769}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12280701754385964, "exact_match_stderr,custom-extract": 0.01162599162597018}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.198, "acc_stderr,none": 0.017838958963847247, "acc_norm,none": 0.304, "acc_norm_stderr,none": 0.020591649571224932}, "piqa": {"alias": "piqa", "acc,none": 0.6523394994559304, "acc_stderr,none": 0.011111173661393732, "acc_norm,none": 0.6534276387377584, "acc_norm_stderr,none": 0.011103020320872169}, "race": {"alias": "race", "acc,none": 0.3368421052631579, "acc_stderr,none": 0.014627543869045127}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4007164790174002, "acc_stderr,none": 0.011088776944394379}, "winogrande": {"alias": "winogrande", "acc,none": 0.5564325177584846, "acc_stderr,none": 0.013962694907620402}} {"created_at": 
"2025-04-15T18:30:31.347093", "global_step": 66000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2901023890784983, "acc_stderr,none": 0.013261573677520774, "acc_norm,none": 0.3293515358361775, "acc_norm_stderr,none": 0.013734057652635476}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5896464646464646, "acc_stderr,none": 0.010093531255765458, "acc_norm,none": 0.5446127946127947, "acc_norm_stderr,none": 0.010218861787618721}, "boolq": {"alias": "boolq", "acc,none": 0.6247706422018349, "acc_stderr,none": 0.008468397820914278}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20475020475020475, "acc_stderr,none": 0.011552714477876666}, "copa": {"alias": "copa", "acc,none": 0.67, "acc_stderr,none": 0.047258156262526066}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3668591913961362, "acc_stderr,none": 0.004809626723626822, "acc_norm,none": 0.466938856801434, "acc_norm_stderr,none": 0.004978861409119813}, "mmlu": {"acc,none": 0.2319470160945734, "acc_stderr,none": 0.0035597191609594123, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23528161530286928, "acc_stderr,none": 0.0061842992115013675, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.29365079365079366, "acc_stderr,none": 0.04073524322147125}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2, "acc_stderr,none": 0.031234752377721175}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.029771775228145638}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.26582278481012656, "acc_stderr,none": 0.02875679962965834}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.03984979653302871}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2331288343558282, "acc_stderr,none": 0.033220157957767414}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2514450867052023, "acc_stderr,none": 0.023357365785874037}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23687150837988827, "acc_stderr,none": 0.014219570788103986}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19292604501607716, "acc_stderr,none": 0.022411516780911363}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.02240967454730419}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23728813559322035, "acc_stderr,none": 0.010865436690780278}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.21637426900584794, "acc_stderr,none": 0.031581495393387345}, "mmlu_other": {"acc,none": 0.24074670099774703, "acc_stderr,none": 0.007655752130240045, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2188679245283019, "acc_stderr,none": 0.02544786382510861}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.21965317919075145, "acc_stderr,none": 0.031568093627031744}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.13, "acc_stderr,none": 0.03379976689896309}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 
0.3094170403587444, "acc_stderr,none": 0.031024411740572192}, "mmlu_management": {"alias": " - management", "acc,none": 0.22330097087378642, "acc_stderr,none": 0.04123553189891431}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.014866821664709604}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.20915032679738563, "acc_stderr,none": 0.02328768531233481}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24468085106382978, "acc_stderr,none": 0.02564555362226673}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.24632352941176472, "acc_stderr,none": 0.02617343857052}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.27710843373493976, "acc_stderr,none": 0.03484331592680589}, "mmlu_social_sciences": {"acc,none": 0.22066948326291844, "acc_stderr,none": 0.007474603163957737, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518754}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.1919191919191919, "acc_stderr,none": 0.02805779167298902}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.18134715025906736, "acc_stderr,none": 0.027807032360686088}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20512820512820512, "acc_stderr,none": 0.020473233173551975}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.026653531596715487}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.21284403669724772, "acc_stderr,none": 0.017549376389313694}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306086}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24836601307189543, "acc_stderr,none": 0.017479487001364764}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2, "acc_stderr,none": 0.038313051408846034}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19591836734693877, "acc_stderr,none": 0.025409301953225678}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.03014777593540922}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_stem": {"acc,none": 0.22930542340627974, "acc_stderr,none": 0.007486113824170444, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.034554737023254366}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.24342105263157895, "acc_stderr,none": 0.034923496688842384}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, 
"mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.04389869956808779}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.02880998985410297}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.020742740560122666}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1967741935483871, "acc_stderr,none": 0.022616409420742015}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.16748768472906403, "acc_stderr,none": 0.026273086047535414}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.02455617221914124}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23178807947019867, "acc_stderr,none": 0.034454062719870546}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.25462962962962965, "acc_stderr,none": 0.029711275860005357}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.09109042553191489, "exact_match_stderr,custom-extract": 0.0026147567026628855, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.12412831241283125, "exact_match_stderr,custom-extract": 0.012322509407061304}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09885931558935361, "exact_match_stderr,custom-extract": 0.010632661544075475}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.052120141342756186, "exact_match_stderr,custom-extract": 0.006609188484367754}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.0951219512195122, "exact_match_stderr,custom-extract": 0.014506870947377836}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.10426540284360189, "exact_match_stderr,custom-extract": 0.010525579113489896}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08152734778121776, "exact_match_stderr,custom-extract": 0.00879522781860576}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1100244498777506, "exact_match_stderr,custom-extract": 0.010947693055603865}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.08661417322834646, "exact_match_stderr,custom-extract": 0.014428786853508842}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08265213442325159, "exact_match_stderr,custom-extract": 0.008302286601704924}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.04885270170244264, 
"exact_match_stderr,custom-extract": 0.005866800202950917}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12121212121212122, "exact_match_stderr,custom-extract": 0.01074271871039955}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11022044088176353, "exact_match_stderr,custom-extract": 0.014033229017364488}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09006928406466513, "exact_match_stderr,custom-extract": 0.007946120960248703}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12531328320802004, "exact_match_stderr,custom-extract": 0.011727235844461762}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.206, "acc_stderr,none": 0.018104794037333546, "acc_norm,none": 0.302, "acc_norm_stderr,none": 0.020553269174209188}, "piqa": {"alias": "piqa", "acc,none": 0.6430903155603918, "acc_stderr,none": 0.011177909079261191, "acc_norm,none": 0.6485310119695321, "acc_norm_stderr,none": 0.011139207691931191}, "race": {"alias": "race", "acc,none": 0.3253588516746411, "acc_stderr,none": 0.01449998247163688}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.40429887410440124, "acc_stderr,none": 0.011104892398300802}, "winogrande": {"alias": "winogrande", "acc,none": 0.5461720599842147, "acc_stderr,none": 0.013992441563707063}} {"created_at": "2025-04-15T20:20:07.704941", "global_step": 72000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.29692832764505117, "acc_stderr,none": 0.013352025976725223, "acc_norm,none": 0.34044368600682595, "acc_norm_stderr,none": 0.013847460518892978}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6195286195286195, "acc_stderr,none": 0.009962305992058572, "acc_norm,none": 0.5968013468013468, "acc_norm_stderr,none": 0.010065668576794798}, "boolq": {"alias": "boolq", "acc,none": 0.5464831804281346, "acc_stderr,none": 0.008707182331111648}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.1941031941031941, "acc_stderr,none": 0.011323381588920444}, "copa": {"alias": "copa", "acc,none": 0.68, "acc_stderr,none": 0.046882617226215034}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3736307508464449, "acc_stderr,none": 0.004827786289074849, "acc_norm,none": 0.47629954192391954, "acc_norm_stderr,none": 0.004984172621822885}, "mmlu": {"acc,none": 0.24291411479846176, "acc_stderr,none": 0.0036160639344238288, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2558979808714134, "acc_stderr,none": 0.006360420069777903, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.039325376803928704}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.03546563019624336}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23039215686274508, "acc_stderr,none": 0.029554292605695053}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.30578512396694213, "acc_stderr,none": 0.04205953933884123}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - 
moral_disputes", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.02361867831006937}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2346368715083799, "acc_stderr,none": 0.014173044098303675}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.24115755627009647, "acc_stderr,none": 0.024296594034763426}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.023468429832451166}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2620599739243807, "acc_stderr,none": 0.011231552795890396}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3391812865497076, "acc_stderr,none": 0.03631053496488905}, "mmlu_other": {"acc,none": 0.246218216929514, "acc_stderr,none": 0.0077197045496121735, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421296}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22641509433962265, "acc_stderr,none": 0.025757559893106748}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2254335260115607, "acc_stderr,none": 0.03186209851641144}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3183856502242152, "acc_stderr,none": 0.03126580522513713}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.04058042015646036}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674047}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2388250319284802, "acc_stderr,none": 0.015246803197398687}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.21241830065359477, "acc_stderr,none": 0.023420375478296125}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25886524822695034, "acc_stderr,none": 0.026129572527180848}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1948529411764706, "acc_stderr,none": 0.02406059942348742}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.2284692882677933, "acc_stderr,none": 0.007565710756563308, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.040969851398436716}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.20707070707070707, "acc_stderr,none": 0.02886977846026705}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21243523316062177, "acc_stderr,none": 0.029519282616817254}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.21025641025641026, "acc_stderr,none": 0.020660597485026924}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.026265024608275886}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.2018348623853211, "acc_stderr,none": 0.017208579357787565}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 
0.21374045801526717, "acc_stderr,none": 0.0359546161177469}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.017952449196987866}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.04172343038705383}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20816326530612245, "acc_stderr,none": 0.025991117672813296}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916714}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_stem": {"acc,none": 0.23437995559784333, "acc_stderr,none": 0.0075416643630399575, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165044}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.03455473702325435}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2565789473684211, "acc_stderr,none": 0.03554180368025689}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.30392156862745096, "acc_stderr,none": 0.045766654032077636}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.25957446808510637, "acc_stderr,none": 0.028659179374292323}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.037245636197746325}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.22486772486772486, "acc_stderr,none": 0.021502096078229147}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1935483870967742, "acc_stderr,none": 0.02247525852553606}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.22660098522167488, "acc_stderr,none": 0.029454863835292975}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24814814814814815, "acc_stderr,none": 0.026335739404055803}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.271523178807947, "acc_stderr,none": 0.036313298039696525}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.026491914727355147}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25, "acc_stderr,none": 0.04109974682633932}, "mmlu_pro": {"exact_match,custom-extract": 0.09034242021276596, "exact_match_stderr,custom-extract": 0.0026038368392467064, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.07252440725244072, 
"exact_match_stderr,custom-extract": 0.009692522718838486}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10519645120405577, "exact_match_stderr,custom-extract": 0.010929524923270323}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.052120141342756186, "exact_match_stderr,custom-extract": 0.006609188484367765}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.09024390243902439, "exact_match_stderr,custom-extract": 0.014168039768581506}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.11018957345971564, "exact_match_stderr,custom-extract": 0.010784628980842379}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.054695562435500514, "exact_match_stderr,custom-extract": 0.007308432091333985}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.11858190709046455, "exact_match_stderr,custom-extract": 0.01131069177146761}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10236220472440945, "exact_match_stderr,custom-extract": 0.015549935163883116}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07992733878292461, "exact_match_stderr,custom-extract": 0.00817640503900221}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07179866765358993, "exact_match_stderr,custom-extract": 0.007026068864342856}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12337662337662338, "exact_match_stderr,custom-extract": 0.010824855641262714}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.156312625250501, "exact_match_stderr,custom-extract": 0.016273206379169094}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08468052347959969, "exact_match_stderr,custom-extract": 0.007727531295642343}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.10902255639097744, "exact_match_stderr,custom-extract": 0.011039829715022964}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.198, "acc_stderr,none": 0.017838958963847244, "acc_norm,none": 0.332, "acc_norm_stderr,none": 0.021081766571222856}, "piqa": {"alias": "piqa", "acc,none": 0.6474428726877041, "acc_stderr,none": 0.011147074365010456, "acc_norm,none": 0.6523394994559304, "acc_norm_stderr,none": 0.01111117366139373}, "race": {"alias": "race", "acc,none": 0.3416267942583732, "acc_stderr,none": 0.014677827770761077}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4155578300921187, "acc_stderr,none": 0.011151553840954863}, "winogrande": {"alias": "winogrande", "acc,none": 0.5832675611681136, "acc_stderr,none": 0.013856250072796315}} {"created_at": "2025-04-15T22:26:21.497683", "global_step": 74000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.30802047781569963, "acc_stderr,none": 0.01349142951729204, "acc_norm,none": 0.3361774744027304, "acc_norm_stderr,none": 0.01380485502620576}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5976430976430976, "acc_stderr,none": 0.01006224471101153, "acc_norm,none": 0.5761784511784511, "acc_norm_stderr,none": 0.010140006095213604}, "boolq": {"alias": "boolq", "acc,none": 0.5474006116207951, "acc_stderr,none": 0.00870566919043118}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.18837018837018837, "acc_stderr,none": 0.011194511993535688}, "copa": {"alias": "copa", "acc,none": 0.71, "acc_stderr,none": 0.04560480215720684}, "hellaswag": 
{"alias": "hellaswag", "acc,none": 0.3721370244971121, "acc_stderr,none": 0.0048238677613324675, "acc_norm,none": 0.4731129257120096, "acc_norm_stderr,none": 0.004982561815214122}, "mmlu": {"acc,none": 0.24882495371029767, "acc_stderr,none": 0.0036464606320312596, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24272051009564294, "acc_stderr,none": 0.00625355087650984, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.03970158273235172}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.19393939393939394, "acc_stderr,none": 0.030874145136562097}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591362}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.02904133351059804}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04065578140908705}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.043300437496507437}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.25722543352601157, "acc_stderr,none": 0.02353292543104429}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2324022346368715, "acc_stderr,none": 0.014125968754673387}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.22508038585209003, "acc_stderr,none": 0.023720088516179034}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.23765432098765432, "acc_stderr,none": 0.023683591837008557}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24902216427640156, "acc_stderr,none": 0.01104489226404077}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.21637426900584794, "acc_stderr,none": 0.03158149539338733}, "mmlu_other": {"acc,none": 0.27550691985838427, "acc_stderr,none": 0.0080044495677572, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2528301886792453, "acc_stderr,none": 0.026749899771241235}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.03242414757483099}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3452914798206278, "acc_stderr,none": 0.03191100192835794}, "mmlu_management": {"alias": " - management", "acc,none": 0.2621359223300971, "acc_stderr,none": 0.043546310772605956}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2606837606837607, "acc_stderr,none": 0.028760348956523414}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.27458492975734355, "acc_stderr,none": 0.01595982993308402}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.21895424836601307, "acc_stderr,none": 0.02367908986180772}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 
0.26595744680851063, "acc_stderr,none": 0.02635806569888059}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2757352941176471, "acc_stderr,none": 0.027146271936625173}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.29518072289156627, "acc_stderr,none": 0.0355092018568963}, "mmlu_social_sciences": {"acc,none": 0.2378940526486838, "acc_stderr,none": 0.007679804250607694, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.0409698513984367}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.029126522834586825}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20207253886010362, "acc_stderr,none": 0.02897908979429673}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.02242127361292371}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2184873949579832, "acc_stderr,none": 0.026841514322958945}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22935779816513763, "acc_stderr,none": 0.018025349724618688}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.016819028375736397}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2818181818181818, "acc_stderr,none": 0.04309118709946458}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2693877551020408, "acc_stderr,none": 0.02840125202902294}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.263681592039801, "acc_stderr,none": 0.031157150869355547}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_stem": {"acc,none": 0.24230891214716144, "acc_stderr,none": 0.00763279820449192, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.037498507091740206}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19736842105263158, "acc_stderr,none": 0.03238981601699397}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2152777777777778, "acc_stderr,none": 0.03437079344106134}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.21, "acc_stderr,none": 0.04093601807403326}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.028809989854102956}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 
0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2328042328042328, "acc_stderr,none": 0.02176596167215454}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2161290322580645, "acc_stderr,none": 0.02341529343356853}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.23645320197044334, "acc_stderr,none": 0.029896114291733552}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036622}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.026593939101844065}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.27314814814814814, "acc_stderr,none": 0.03038805130167812}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.0443280405529152}, "mmlu_pro": {"exact_match,custom-extract": 0.08851396276595745, "exact_match_stderr,custom-extract": 0.002579150067883842, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.09065550906555091, "exact_match_stderr,custom-extract": 0.010730125693788095}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.12167300380228137, "exact_match_stderr,custom-extract": 0.011645606780860747}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.04770318021201413, "exact_match_stderr,custom-extract": 0.0063376540808223565}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.08292682926829269, "exact_match_stderr,custom-extract": 0.01363602755824417}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.12322274881516587, "exact_match_stderr,custom-extract": 0.011320778562942307}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.06604747162022703, "exact_match_stderr,custom-extract": 0.00798275537524433}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.0941320293398533, "exact_match_stderr,custom-extract": 0.010216217601013364}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12598425196850394, "exact_match_stderr,custom-extract": 0.017022602638569504}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10990009082652134, "exact_match_stderr,custom-extract": 0.009430225142537546}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.050333086602516654, "exact_match_stderr,custom-extract": 0.005950391645720979}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10822510822510822, "exact_match_stderr,custom-extract": 0.010225646711914615}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11823647294589178, "exact_match_stderr,custom-extract": 0.014468953704661763}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.06928406466512702, "exact_match_stderr,custom-extract": 0.00704835724058543}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.10651629072681704, "exact_match_stderr,custom-extract": 0.01092753423409934}, "openbookqa": {"alias": 
"openbookqa", "acc,none": 0.23, "acc_stderr,none": 0.018839050391123147, "acc_norm,none": 0.338, "acc_norm_stderr,none": 0.02117566569520941}, "piqa": {"alias": "piqa", "acc,none": 0.6545157780195865, "acc_stderr,none": 0.011094802893617764, "acc_norm,none": 0.6550598476605005, "acc_norm_stderr,none": 0.011090670102993153}, "race": {"alias": "race", "acc,none": 0.3406698564593301, "acc_stderr,none": 0.014667904380876562}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.40992835209826, "acc_stderr,none": 0.0111289766360077}, "winogrande": {"alias": "winogrande", "acc,none": 0.5651144435674822, "acc_stderr,none": 0.013932814110418027}} {"created_at": "2025-04-15T23:58:02.287250", "global_step": 76000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2883959044368601, "acc_stderr,none": 0.013238394422428173, "acc_norm,none": 0.318259385665529, "acc_norm_stderr,none": 0.013611993916971451}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5816498316498316, "acc_stderr,none": 0.010122061470742849, "acc_norm,none": 0.5555555555555556, "acc_norm_stderr,none": 0.01019625483869168}, "boolq": {"alias": "boolq", "acc,none": 0.6061162079510704, "acc_stderr,none": 0.008545835792614982}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19737919737919737, "acc_stderr,none": 0.011395305685091192}, "copa": {"alias": "copa", "acc,none": 0.65, "acc_stderr,none": 0.047937248544110196}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.37263493328022307, "acc_stderr,none": 0.004825179407757559, "acc_norm,none": 0.4720175263891655, "acc_norm_stderr,none": 0.0049819610975908}, "mmlu": {"acc,none": 0.24270047001851588, "acc_stderr,none": 0.0036155697189867056, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24654622741764082, "acc_stderr,none": 0.006286009383567809, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.03932537680392871}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23030303030303031, "acc_stderr,none": 0.032876667586034886}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.030190282453501943}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.29535864978902954, "acc_stderr,none": 0.02969633871342289}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2892561983471074, "acc_stderr,none": 0.04139112727635464}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.04133119440243838}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2398843930635838, "acc_stderr,none": 0.022989592543123563}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.20257234726688103, "acc_stderr,none": 0.022827317491059686}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2345679012345679, "acc_stderr,none": 0.023576881744005716}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2503259452411995, "acc_stderr,none": 0.011064151027165434}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2631578947368421, "acc_stderr,none": 
0.033773102522091945}, "mmlu_other": {"acc,none": 0.26295461860315417, "acc_stderr,none": 0.007894488010583433, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.31, "acc_stderr,none": 0.046482319871173156}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24150943396226415, "acc_stderr,none": 0.026341480371118362}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2774566473988439, "acc_stderr,none": 0.03414014007044036}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.22330097087378642, "acc_stderr,none": 0.04123553189891431}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2863247863247863, "acc_stderr,none": 0.02961432369045665}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23627075351213284, "acc_stderr,none": 0.015190473717037488}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23202614379084968, "acc_stderr,none": 0.024170840879341012}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24822695035460993, "acc_stderr,none": 0.025770015644290396}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2757352941176471, "acc_stderr,none": 0.02714627193662517}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3072289156626506, "acc_stderr,none": 0.03591566797824662}, "mmlu_social_sciences": {"acc,none": 0.22489437764055897, "acc_stderr,none": 0.007527646593674297, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518754}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.20202020202020202, "acc_stderr,none": 0.028606204289229872}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2358974358974359, "acc_stderr,none": 0.021525965407408726}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.02665353159671548}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.2036697247706422, "acc_stderr,none": 0.017266742087630804}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.03768335959728744}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.017630827375148383}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.04122066502878284}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.1836734693877551, "acc_stderr,none": 0.024789071332007657}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23383084577114427, "acc_stderr,none": 0.029929415408348387}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, 
"mmlu_stem": {"acc,none": 0.23437995559784333, "acc_stderr,none": 0.007536667115144542, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.03455473702325435}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.25, "acc_stderr,none": 0.03523807393012047}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2916666666666667, "acc_stderr,none": 0.03800968060554858}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.0377525168068637}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617748}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342347}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131184}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23015873015873015, "acc_stderr,none": 0.021679219663693145}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.20967741935483872, "acc_stderr,none": 0.023157879349083522}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.22660098522167488, "acc_stderr,none": 0.029454863835292982}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2814814814814815, "acc_stderr,none": 0.027420019350945284}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2052980132450331, "acc_stderr,none": 0.032979866484738336}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1574074074074074, "acc_stderr,none": 0.02483717351824239}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25892857142857145, "acc_stderr,none": 0.04157751539865629}, "mmlu_pro": {"exact_match,custom-extract": 0.09092420212765957, "exact_match_stderr,custom-extract": 0.0026160164113003103, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.08089260808926081, "exact_match_stderr,custom-extract": 0.010190160801425412}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10266159695817491, "exact_match_stderr,custom-extract": 0.010812323380686585}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06713780918727916, "exact_match_stderr,custom-extract": 0.0074415092498656505}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.05609756097560976, "exact_match_stderr,custom-extract": 0.011378208553976957}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.11611374407582939, "exact_match_stderr,custom-extract": 0.01103382820165053}, 
"mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07017543859649122, "exact_match_stderr,custom-extract": 0.008210231372398022}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12469437652811736, "exact_match_stderr,custom-extract": 0.01155825482407226}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10236220472440945, "exact_match_stderr,custom-extract": 0.015549935163883116}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09082652134423251, "exact_match_stderr,custom-extract": 0.008664297923859669}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06217616580310881, "exact_match_stderr,custom-extract": 0.006572123520013914}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10281385281385282, "exact_match_stderr,custom-extract": 0.009996920678837277}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1062124248496994, "exact_match_stderr,custom-extract": 0.013806708227626993}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1031562740569669, "exact_match_stderr,custom-extract": 0.008442457140721601}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.10401002506265664, "exact_match_stderr,custom-extract": 0.010813343895839517}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.226, "acc_stderr,none": 0.018722956449139922, "acc_norm,none": 0.32, "acc_norm_stderr,none": 0.020882340488761805}, "piqa": {"alias": "piqa", "acc,none": 0.6496191512513602, "acc_stderr,none": 0.011131277554681733, "acc_norm,none": 0.6463547334058759, "acc_norm_stderr,none": 0.011154877708188689}, "race": {"alias": "race", "acc,none": 0.3320574162679426, "acc_stderr,none": 0.014575582129545914}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.3991811668372569, "acc_stderr,none": 0.011081681624606212}, "winogrande": {"alias": "winogrande", "acc,none": 0.5832675611681136, "acc_stderr,none": 0.013856250072796318}} {"created_at": "2025-04-16T05:19:19.284526", "global_step": 82000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2901023890784983, "acc_stderr,none": 0.013261573677520773, "acc_norm,none": 0.3361774744027304, "acc_norm_stderr,none": 0.013804855026205761}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5736531986531986, "acc_stderr,none": 0.010147858603835125, "acc_norm,none": 0.5281986531986532, "acc_norm_stderr,none": 0.010243454104071783}, "boolq": {"alias": "boolq", "acc,none": 0.6314984709480123, "acc_stderr,none": 0.008437199893502969}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19983619983619982, "acc_stderr,none": 0.011448447996728383}, "copa": {"alias": "copa", "acc,none": 0.67, "acc_stderr,none": 0.04725815626252607}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.368352917745469, "acc_stderr,none": 0.004813719952829969, "acc_norm,none": 0.47251543517227645, "acc_norm_stderr,none": 0.004982237133409156}, "mmlu": {"acc,none": 0.23280159521435692, "acc_stderr,none": 0.0035621466031978016, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23485653560042508, "acc_stderr,none": 0.006177566352907708, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.03893259610604674}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.19393939393939394, "acc_stderr,none": 0.03087414513656209}, 
"mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.03019028245350194}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.23628691983122363, "acc_stderr,none": 0.02765215314415927}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.24793388429752067, "acc_stderr,none": 0.03941897526516302}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2085889570552147, "acc_stderr,none": 0.031921934489347215}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.16720257234726688, "acc_stderr,none": 0.02119387252803497}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.20987654320987653, "acc_stderr,none": 0.02265834408598137}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.242503259452412, "acc_stderr,none": 0.010946570966348776}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.28654970760233917, "acc_stderr,none": 0.03467826685703826}, "mmlu_other": {"acc,none": 0.2417122626327647, "acc_stderr,none": 0.00765696070887335, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.20754716981132076, "acc_stderr,none": 0.02495991802891127}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.16184971098265896, "acc_stderr,none": 0.028083594279575765}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036845}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.32286995515695066, "acc_stderr,none": 0.03138147637575498}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.0376017800602662}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.29914529914529914, "acc_stderr,none": 0.02999695185834948}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24010217113665389, "acc_stderr,none": 0.015274685213734195}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.024288619466046102}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25177304964539005, "acc_stderr,none": 0.025892151156709405}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19117647058823528, "acc_stderr,none": 0.023886881922440335}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2710843373493976, "acc_stderr,none": 0.03460579907553027}, "mmlu_social_sciences": {"acc,none": 0.22814429639259018, "acc_stderr,none": 0.007563953731399623, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.04142439719489362}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.21212121212121213, "acc_stderr,none": 
0.02912652283458682}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20725388601036268, "acc_stderr,none": 0.029252823291803627}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2, "acc_stderr,none": 0.020280805062535722}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2184873949579832, "acc_stderr,none": 0.026841514322958945}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1963302752293578, "acc_stderr,none": 0.017030719339154354}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.017848089574913226}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.04069306319721376}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22857142857142856, "acc_stderr,none": 0.026882144922307744}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_stem": {"acc,none": 0.22549952426260705, "acc_stderr,none": 0.007434444240255636, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.17777777777777778, "acc_stderr,none": 0.033027898599017176}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.20394736842105263, "acc_stderr,none": 0.032790004063100515}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.22916666666666666, "acc_stderr,none": 0.035146974678623884}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.0377525168068637}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.03873958714149352}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.25957446808510637, "acc_stderr,none": 0.028659179374292323}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131184}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.021935878081184763}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2064516129032258, "acc_stderr,none": 0.023025899617188733}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.18226600985221675, "acc_stderr,none": 0.02716334085964515}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 
0.23333333333333334, "acc_stderr,none": 0.02578787422095931}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18055555555555555, "acc_stderr,none": 0.02623287897149166}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.09881981382978723, "exact_match_stderr,custom-extract": 0.0027134143339787044, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.14086471408647142, "exact_match_stderr,custom-extract": 0.013000958624507538}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11533586818757921, "exact_match_stderr,custom-extract": 0.011379109995317205}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.060954063604240286, "exact_match_stderr,custom-extract": 0.007113993242445064}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.12195121951219512, "exact_match_stderr,custom-extract": 0.0161804554422017}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.12203791469194313, "exact_match_stderr,custom-extract": 0.011273830017419379}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09391124871001032, "exact_match_stderr,custom-extract": 0.009375760359013236}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1136919315403423, "exact_match_stderr,custom-extract": 0.011105705318793015}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10761154855643044, "exact_match_stderr,custom-extract": 0.015896979452723375}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.0971843778383288, "exact_match_stderr,custom-extract": 0.008931027353227365}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06365655070318282, "exact_match_stderr,custom-extract": 0.006644652222081494}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10930735930735931, "exact_match_stderr,custom-extract": 0.010270410036987683}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.08817635270541083, "exact_match_stderr,custom-extract": 0.012706233135747353}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08391070053887606, "exact_match_stderr,custom-extract": 0.0076955600667595065}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12907268170426064, "exact_match_stderr,custom-extract": 0.01187623992295407}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.21, "acc_stderr,none": 0.018233620865305916, "acc_norm,none": 0.328, "acc_norm_stderr,none": 0.021017027165175485}, "piqa": {"alias": "piqa", "acc,none": 0.6414581066376496, "acc_stderr,none": 0.011189212572356364, "acc_norm,none": 0.6420021762785637, "acc_norm_stderr,none": 0.011185460416617296}, "race": {"alias": "race", "acc,none": 0.35023923444976074, "acc_stderr,none": 0.014764164998575968}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4017400204708291, "acc_stderr,none": 0.011093444192711178}, "winogrande": {"alias": "winogrande", "acc,none": 0.5611681136543015, "acc_stderr,none": 0.013946933444507034}}
{"created_at": "2025-04-16T09:15:16.015593", "global_step": 86000, 
"arc_challenge": {"alias": "arc_challenge", "acc,none": 0.28754266211604096, "acc_stderr,none": 0.013226719056266132, "acc_norm,none": 0.3250853242320819, "acc_norm_stderr,none": 0.013688147309729122}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5963804713804713, "acc_stderr,none": 0.010067368960348226, "acc_norm,none": 0.5643939393939394, "acc_norm_stderr,none": 0.010174341733665219}, "boolq": {"alias": "boolq", "acc,none": 0.41559633027522935, "acc_stderr,none": 0.008619555273337564}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2194922194922195, "acc_stderr,none": 0.011849997754533952}, "copa": {"alias": "copa", "acc,none": 0.7, "acc_stderr,none": 0.046056618647183814}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.37641904003186616, "acc_stderr,none": 0.004834969412883646, "acc_norm,none": 0.48645688109938257, "acc_norm_stderr,none": 0.004987950663406554}, "mmlu": {"acc,none": 0.2310212220481413, "acc_stderr,none": 0.0035500822356133864, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2450584484590861, "acc_stderr,none": 0.006267591586997277, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04006168083848878}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.031922715695482995}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.030587591351604246}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.27848101265822783, "acc_stderr,none": 0.029178682304842555}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.032910995786157686}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2514450867052023, "acc_stderr,none": 0.023357365785874037}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480764}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2191358024691358, "acc_stderr,none": 0.023016705640262196}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2516297262059974, "acc_stderr,none": 0.011083276280441907}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.32748538011695905, "acc_stderr,none": 0.035993357714560276}, "mmlu_other": {"acc,none": 0.2413904087544255, "acc_stderr,none": 0.00765278466714354, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21132075471698114, "acc_stderr,none": 0.025125766484827845}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2023121387283237, "acc_stderr,none": 0.03063114553919882}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.14, "acc_stderr,none": 0.03487350880197771}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3183856502242152, "acc_stderr,none": 
0.03126580522513713}, "mmlu_management": {"alias": " - management", "acc,none": 0.18446601941747573, "acc_stderr,none": 0.03840423627288276}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.29914529914529914, "acc_stderr,none": 0.02999695185834948}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24010217113665389, "acc_stderr,none": 0.015274685213734195}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.0239291555173513}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2375886524822695, "acc_stderr,none": 0.025389512552729903}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19117647058823528, "acc_stderr,none": 0.023886881922440338}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.21871953201169972, "acc_stderr,none": 0.007452042044982568, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.038351539543994194}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.1717171717171717, "acc_stderr,none": 0.026869716187429914}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20512820512820512, "acc_stderr,none": 0.020473233173551975}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1981651376146789, "acc_stderr,none": 0.017090573804217885}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.01755581809132227}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072775}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20816326530612245, "acc_stderr,none": 0.025991117672813296}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.030360490154014666}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_stem": {"acc,none": 0.2118617189977799, "acc_stderr,none": 0.007261460373789619, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.17777777777777778, "acc_stderr,none": 0.033027898599017176}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.18421052631578946, "acc_stderr,none": 0.0315469804508223}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, 
"mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617746}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.02880998985410297}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.20899470899470898, "acc_stderr,none": 0.02094048156533485}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.18064516129032257, "acc_stderr,none": 0.021886178567172548}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.15763546798029557, "acc_stderr,none": 0.025639014131172404}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.024882116857655113}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.14814814814814814, "acc_stderr,none": 0.02422762927372836}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.33035714285714285, "acc_stderr,none": 0.044642857142857116}, "mmlu_pro": {"exact_match,custom-extract": 0.08884640957446809, "exact_match_stderr,custom-extract": 0.0025882307136233483, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.08507670850767085, "exact_match_stderr,custom-extract": 0.010426562968267427}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09885931558935361, "exact_match_stderr,custom-extract": 0.010632661544075505}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.054770318021201414, "exact_match_stderr,custom-extract": 0.0067656574329187915}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.0975609756097561, "exact_match_stderr,custom-extract": 0.014671865834334242}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.10308056872037914, "exact_match_stderr,custom-extract": 0.01047252322376017}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.0784313725490196, "exact_match_stderr,custom-extract": 0.008641140565804537}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12102689486552567, "exact_match_stderr,custom-extract": 0.011410842488489002}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11023622047244094, "exact_match_stderr,custom-extract": 0.016065998434778184}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09264305177111716, "exact_match_stderr,custom-extract": 0.008741765825863196}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 
0.05477424130273871, "exact_match_stderr,custom-extract": 0.006192828815262607}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11796536796536797, "exact_match_stderr,custom-extract": 0.010617425726799192}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.09619238476953908, "exact_match_stderr,custom-extract": 0.013212763839442681}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08314087759815242, "exact_match_stderr,custom-extract": 0.007663395880276773}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.10401002506265664, "exact_match_stderr,custom-extract": 0.010813343895839498}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.222, "acc_stderr,none": 0.01860441475825008, "acc_norm,none": 0.326, "acc_norm_stderr,none": 0.020984009562393567}, "piqa": {"alias": "piqa", "acc,none": 0.6458106637649619, "acc_stderr,none": 0.011158755672626114, "acc_norm,none": 0.6561479869423286, "acc_norm_stderr,none": 0.011082356277961393}, "race": {"alias": "race", "acc,none": 0.34545454545454546, "acc_stderr,none": 0.014716858425461336}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4032753326509724, "acc_stderr,none": 0.011100350776719526}, "winogrande": {"alias": "winogrande", "acc,none": 0.585635359116022, "acc_stderr,none": 0.01384484623226856}} {"created_at": "2025-04-16T10:55:08.610728", "global_step": 88000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2883959044368601, "acc_stderr,none": 0.01323839442242817, "acc_norm,none": 0.33532423208191126, "acc_norm_stderr,none": 0.013796182947785562}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6060606060606061, "acc_stderr,none": 0.010026305355981814, "acc_norm,none": 0.5753367003367004, "acc_norm_stderr,none": 0.010142653687480411}, "boolq": {"alias": "boolq", "acc,none": 0.5703363914373089, "acc_stderr,none": 0.00865809540849789}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20556920556920558, "acc_stderr,none": 0.011569834551534297}, "copa": {"alias": "copa", "acc,none": 0.68, "acc_stderr,none": 0.046882617226215034}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.37621987651862177, "acc_stderr,none": 0.004834461997944859, "acc_norm,none": 0.48078072097191793, "acc_norm_stderr,none": 0.004986093791041651}, "mmlu": {"acc,none": 0.24939467312348668, "acc_stderr,none": 0.0036465438274978926, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2499468650371945, "acc_stderr,none": 0.006312326575779651, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03670066451047181}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.03374402644139406}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.030190282453501936}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2489451476793249, "acc_stderr,none": 0.028146970599422644}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.33884297520661155, "acc_stderr,none": 0.043207678075366705}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.042365112580946315}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.17791411042944785, "acc_stderr,none": 0.03004735765580665}, 
"mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2572347266881029, "acc_stderr,none": 0.024826171289250888}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.024922001168886335}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2516297262059974, "acc_stderr,none": 0.011083276280441902}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.26900584795321636, "acc_stderr,none": 0.034010526201040885}, "mmlu_other": {"acc,none": 0.2603797875764403, "acc_stderr,none": 0.007863838705603325, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23018867924528302, "acc_stderr,none": 0.02590789712240817}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.19653179190751446, "acc_stderr,none": 0.03029957466478814}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621505}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.33183856502242154, "acc_stderr,none": 0.03160295143776679}, "mmlu_management": {"alias": " - management", "acc,none": 0.2912621359223301, "acc_stderr,none": 0.04498676320572922}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.27350427350427353, "acc_stderr,none": 0.02920254015343119}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.26436781609195403, "acc_stderr,none": 0.015769984840690515}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24183006535947713, "acc_stderr,none": 0.024518195641879334}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.026011992930902006}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.23161764705882354, "acc_stderr,none": 0.025626533803777562}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.23493975903614459, "acc_stderr,none": 0.03300533186128922}, "mmlu_social_sciences": {"acc,none": 0.2528436789080273, "acc_stderr,none": 0.007834746296277726, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25757575757575757, "acc_stderr,none": 0.031156269519646847}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22797927461139897, "acc_stderr,none": 0.03027690994517824}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2230769230769231, "acc_stderr,none": 0.02110773012724399}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.028657491285071973}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.27155963302752295, "acc_stderr,none": 0.019069098363191452}, "mmlu_human_sexuality": {"alias": " - 
human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596919}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.017848089574913226}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.33636363636363636, "acc_stderr,none": 0.04525393596302505}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20816326530612245, "acc_stderr,none": 0.025991117672813292}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.27860696517412936, "acc_stderr,none": 0.031700561834973086}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_stem": {"acc,none": 0.23437995559784333, "acc_stderr,none": 0.007516517510202672, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.035914440841969694}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.23026315789473684, "acc_stderr,none": 0.034260594244031654}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2152777777777778, "acc_stderr,none": 0.03437079344106135}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.15, "acc_stderr,none": 0.035887028128263714}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.15, "acc_stderr,none": 0.035887028128263734}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.17647058823529413, "acc_stderr,none": 0.03793281185307808}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3276595744680851, "acc_stderr,none": 0.030683020843231}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.021132859182754447}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24838709677419354, "acc_stderr,none": 0.02458002892148101}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.22167487684729065, "acc_stderr,none": 0.029225575892489593}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.026202766534652148}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2119205298013245, "acc_stderr,none": 0.03336767086567977}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1574074074074074, "acc_stderr,none": 0.024837173518242384}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "mmlu_pro": {"exact_match,custom-extract": 0.09998337765957446, "exact_match_stderr,custom-extract": 0.0027255524899319826, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", 
"exact_match,custom-extract": 0.09902370990237098, "exact_match_stderr,custom-extract": 0.011162713195868411}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.08998732572877059, "exact_match_stderr,custom-extract": 0.010194156217460263}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.05830388692579505, "exact_match_stderr,custom-extract": 0.006967433636014822}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.07560975609756097, "exact_match_stderr,custom-extract": 0.013072388347810092}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.13981042654028436, "exact_match_stderr,custom-extract": 0.011944090354236481}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07223942208462332, "exact_match_stderr,custom-extract": 0.008320844580110067}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12469437652811736, "exact_match_stderr,custom-extract": 0.011558254824072248}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.14698162729658792, "exact_match_stderr,custom-extract": 0.018164310621441047}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1008174386920981, "exact_match_stderr,custom-extract": 0.009078109672456584}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0695780903034789, "exact_match_stderr,custom-extract": 0.006924833446490222}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1277056277056277, "exact_match_stderr,custom-extract": 0.010985901551102245}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12625250501002003, "exact_match_stderr,custom-extract": 0.01488326800954695}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10161662817551963, "exact_match_stderr,custom-extract": 0.008386406247532766}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12531328320802004, "exact_match_stderr,custom-extract": 0.01172723584446176}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.21, "acc_stderr,none": 0.018233620865305913, "acc_norm,none": 0.322, "acc_norm_stderr,none": 0.020916668330019882}, "piqa": {"alias": "piqa", "acc,none": 0.6626768226332971, "acc_stderr,none": 0.011031114785059705, "acc_norm,none": 0.6605005440696409, "acc_norm_stderr,none": 0.011048455047173916}, "race": {"alias": "race", "acc,none": 0.3311004784688995, "acc_stderr,none": 0.014564986871061022}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4094165813715456, "acc_stderr,none": 0.011126849576589028}, "winogrande": {"alias": "winogrande", "acc,none": 0.585635359116022, "acc_stderr,none": 0.013844846232268561}} {"created_at": "2025-04-16T12:40:50.417481", "global_step": 90000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3046075085324232, "acc_stderr,none": 0.013449522109932494, "acc_norm,none": 0.3412969283276451, "acc_norm_stderr,none": 0.013855831287497731}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5984848484848485, "acc_stderr,none": 0.010058790020755569, "acc_norm,none": 0.561026936026936, "acc_norm_stderr,none": 0.010183076012972064}, "boolq": {"alias": "boolq", "acc,none": 0.627217125382263, "acc_stderr,none": 0.008457255867914694}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20147420147420148, "acc_stderr,none": 0.011483500195202903}, "copa": {"alias": "copa", "acc,none": 0.67, 
"acc_stderr,none": 0.04725815626252607}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.37751443935471024, "acc_stderr,none": 0.004837744647345723, "acc_norm,none": 0.4812786297550289, "acc_norm_stderr,none": 0.004986282450647317}, "mmlu": {"acc,none": 0.24462327303802878, "acc_stderr,none": 0.003627025241486734, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24187035069075452, "acc_stderr,none": 0.006237917910128818, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30952380952380953, "acc_stderr,none": 0.04134913018303316}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.296969696969697, "acc_stderr,none": 0.035679697722680474}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591361}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25738396624472576, "acc_stderr,none": 0.028458820991460295}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2644628099173554, "acc_stderr,none": 0.04026187527591204}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.044531975073749834}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.1901840490797546, "acc_stderr,none": 0.030833491146281228}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2398843930635838, "acc_stderr,none": 0.022989592543123563}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.22905027932960895, "acc_stderr,none": 0.014054314935614556}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.18971061093247588, "acc_stderr,none": 0.02226819625878321}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.022409674547304193}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2522816166883963, "acc_stderr,none": 0.01109278905687524}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2573099415204678, "acc_stderr,none": 0.03352799844161865}, "mmlu_other": {"acc,none": 0.2442870936594786, "acc_stderr,none": 0.0077019669886537025, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891366}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2254335260115607, "acc_stderr,none": 0.03186209851641144}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.17, "acc_stderr,none": 0.0377525168068637}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3004484304932735, "acc_stderr,none": 0.030769352008229132}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.04058042015646035}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.02987257770889117}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.227330779054917, "acc_stderr,none": 0.01498727064094601}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.0242886194660461}, "mmlu_professional_accounting": {"alias": " - 
professional_accounting", "acc,none": 0.23404255319148937, "acc_stderr,none": 0.02525786135943241}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2536764705882353, "acc_stderr,none": 0.02643132987078953}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25301204819277107, "acc_stderr,none": 0.03384429155233134}, "mmlu_social_sciences": {"acc,none": 0.24406889827754305, "acc_stderr,none": 0.007745012505773717, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.23232323232323232, "acc_stderr,none": 0.030088629490217483}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.23076923076923078, "acc_stderr,none": 0.021362027725222717}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.226890756302521, "acc_stderr,none": 0.027205371538279483}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26055045871559634, "acc_stderr,none": 0.018819182034850068}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2824427480916031, "acc_stderr,none": 0.03948406125768361}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.017630827375148383}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24897959183673468, "acc_stderr,none": 0.027682979522960227}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.21890547263681592, "acc_stderr,none": 0.029239174636647}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_stem": {"acc,none": 0.2496035521725341, "acc_stderr,none": 0.007716761945400281, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165065}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.035914440841969694}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.24342105263157895, "acc_stderr,none": 0.034923496688842384}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.03852084696008534}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.04336432707993176}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2851063829787234, "acc_stderr,none": 0.029513196625539355}, "mmlu_electrical_engineering": {"alias": " - 
electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24867724867724866, "acc_stderr,none": 0.022261817692400175}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.23548387096774193, "acc_stderr,none": 0.024137632429337703}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.21674876847290642, "acc_stderr,none": 0.028990331252516235}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.02606715922227579}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23178807947019867, "acc_stderr,none": 0.03445406271987054}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.02915752218460559}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.22321428571428573, "acc_stderr,none": 0.039523019677025116}, "mmlu_pro": {"exact_match,custom-extract": 0.08244680851063829, "exact_match_stderr,custom-extract": 0.002500101617468454, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.06834030683403068, "exact_match_stderr,custom-extract": 0.009429976369208433}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.08871989860583017, "exact_match_stderr,custom-extract": 0.010129158179627848}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.038869257950530034, "exact_match_stderr,custom-extract": 0.005747289272769008}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.07560975609756097, "exact_match_stderr,custom-extract": 0.013072388347810092}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.11966824644549763, "exact_match_stderr,custom-extract": 0.011178894558789447}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.0629514963880289, "exact_match_stderr,custom-extract": 0.007806320646481141}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.11246943765281174, "exact_match_stderr,custom-extract": 0.011053451044082589}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10498687664041995, "exact_match_stderr,custom-extract": 0.015724991203554563}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07356948228882834, "exact_match_stderr,custom-extract": 0.007871525990705145}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07475943745373798, "exact_match_stderr,custom-extract": 0.0071580291082857656}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.09307359307359307, "exact_match_stderr,custom-extract": 0.009563093747403618}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13026052104208416, "exact_match_stderr,custom-extract": 0.015082951205521085}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07467282525019246, "exact_match_stderr,custom-extract": 0.0072961138747172395}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.09273182957393483, "exact_match_stderr,custom-extract": 
0.010274320069747482}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.212, "acc_stderr,none": 0.018297037004013885, "acc_norm,none": 0.338, "acc_norm_stderr,none": 0.02117566569520941}, "piqa": {"alias": "piqa", "acc,none": 0.6610446137105549, "acc_stderr,none": 0.011044144419710633, "acc_norm,none": 0.6664853101196954, "acc_norm_stderr,none": 0.011000139592184573}, "race": {"alias": "race", "acc,none": 0.3464114832535885, "acc_stderr,none": 0.014726451021782801}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.38843398157625386, "acc_stderr,none": 0.011028822814998104}, "winogrande": {"alias": "winogrande", "acc,none": 0.5674822415153907, "acc_stderr,none": 0.013923911578623842}} {"created_at": "2025-04-16T14:34:52.115348", "global_step": 92000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.26621160409556316, "acc_stderr,none": 0.012915774781523216, "acc_norm,none": 0.33532423208191126, "acc_norm_stderr,none": 0.013796182947785568}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6069023569023569, "acc_stderr,none": 0.010022540618945334, "acc_norm,none": 0.5488215488215489, "acc_norm_stderr,none": 0.010210757101073472}, "boolq": {"alias": "boolq", "acc,none": 0.5256880733944954, "acc_stderr,none": 0.00873350602718366}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21048321048321048, "acc_stderr,none": 0.011671038436522905}, "copa": {"alias": "copa", "acc,none": 0.68, "acc_stderr,none": 0.046882617226215034}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3798048197570205, "acc_stderr,none": 0.004843462545943499, "acc_norm,none": 0.48466440948018324, "acc_norm_stderr,none": 0.004987433862274565}, "mmlu": {"acc,none": 0.2310212220481413, "acc_stderr,none": 0.0035511333673358177, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2459086078639745, "acc_stderr,none": 0.0062753722695856875, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04006168083848878}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.22424242424242424, "acc_stderr,none": 0.03256866661681102}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.03077855467869326}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.02904133351059804}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480764}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21604938271604937, "acc_stderr,none": 0.02289916291844581}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25554106910039115, "acc_stderr,none": 0.011139857833598514}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 
0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.24074670099774703, "acc_stderr,none": 0.007653003838458622, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891366}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2023121387283237, "acc_stderr,none": 0.03063114553919882}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24010217113665389, "acc_stderr,none": 0.015274685213734195}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023805186524888135}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24113475177304963, "acc_stderr,none": 0.025518731049537766}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1875, "acc_stderr,none": 0.023709788253811766}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.21871953201169972, "acc_stderr,none": 0.007447192809441741, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748141}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.18181818181818182, "acc_stderr,none": 0.027479603010538804}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20207253886010362, "acc_stderr,none": 0.02897908979429673}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20512820512820512, "acc_stderr,none": 0.02047323317355198}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1889908256880734, "acc_stderr,none": 0.01678548115920363}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.26717557251908397, "acc_stderr,none": 0.038808483010823944}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25326797385620914, "acc_stderr,none": 0.017593486895366835}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.18775510204081633, "acc_stderr,none": 0.025000256039546212}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 
0.045126085985421276}, "mmlu_stem": {"acc,none": 0.21122740247383445, "acc_stderr,none": 0.007261185248584388, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.03455473702325435}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.17763157894736842, "acc_stderr,none": 0.031103182383123398}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342347}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.20899470899470898, "acc_stderr,none": 0.02094048156533485}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.18387096774193548, "acc_stderr,none": 0.02203721734026784}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.15270935960591134, "acc_stderr,none": 0.025308904539380627}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2074074074074074, "acc_stderr,none": 0.024720713193952134}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2052980132450331, "acc_stderr,none": 0.03297986648473836}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.14814814814814814, "acc_stderr,none": 0.02422762927372836}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25892857142857145, "acc_stderr,none": 0.04157751539865629}, "mmlu_pro": {"exact_match,custom-extract": 0.09217087765957446, "exact_match_stderr,custom-extract": 0.0026305353871765884, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.08786610878661087, "exact_match_stderr,custom-extract": 0.010579946755187827}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09125475285171103, "exact_match_stderr,custom-extract": 0.010258543729935024}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.04770318021201413, "exact_match_stderr,custom-extract": 0.006337654080822372}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.07804878048780488, "exact_match_stderr,custom-extract": 0.013264026422963555}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.12559241706161137, "exact_match_stderr,custom-extract": 
0.011413658642349226}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.0846233230134159, "exact_match_stderr,custom-extract": 0.008945554797547956}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.10024449877750612, "exact_match_stderr,custom-extract": 0.010507066995360125}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.09448818897637795, "exact_match_stderr,custom-extract": 0.0150052772401423}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08265213442325159, "exact_match_stderr,custom-extract": 0.008302286601704931}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07031828275351591, "exact_match_stderr,custom-extract": 0.006958800549270504}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11038961038961038, "exact_match_stderr,custom-extract": 0.010314856083401153}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1342685370741483, "exact_match_stderr,custom-extract": 0.015277913884522445}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09853733641262509, "exact_match_stderr,custom-extract": 0.008272503032902286}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12406015037593984, "exact_match_stderr,custom-extract": 0.011676807835852744}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.226, "acc_stderr,none": 0.018722956449139922, "acc_norm,none": 0.34, "acc_norm_stderr,none": 0.021206117013673066}, "piqa": {"alias": "piqa", "acc,none": 0.6490750816104461, "acc_stderr,none": 0.011135250564776789, "acc_norm,none": 0.6550598476605005, "acc_norm_stderr,none": 0.011090670102993153}, "race": {"alias": "race", "acc,none": 0.3444976076555024, "acc_stderr,none": 0.014707199932728217}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4022517911975435, "acc_stderr,none": 0.011095758951308076}, "winogrande": {"alias": "winogrande", "acc,none": 0.5761641673243884, "acc_stderr,none": 0.013888492389944518}} {"created_at": "2025-04-16T16:22:50.291973", "global_step": 94000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.29692832764505117, "acc_stderr,none": 0.013352025976725222, "acc_norm,none": 0.3310580204778157, "acc_norm_stderr,none": 0.013752062419817837}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5867003367003367, "acc_stderr,none": 0.010104361780747513, "acc_norm,none": 0.5593434343434344, "acc_norm_stderr,none": 0.010187264635711981}, "boolq": {"alias": "boolq", "acc,none": 0.40886850152905196, "acc_stderr,none": 0.008598573693259096}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19164619164619165, "acc_stderr,none": 0.01126862497880164}, "copa": {"alias": "copa", "acc,none": 0.69, "acc_stderr,none": 0.04648231987117316}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38169687313284206, "acc_stderr,none": 0.0048480996616197, "acc_norm,none": 0.4914359689304919, "acc_norm_stderr,none": 0.0049890494303912875}, "mmlu": {"acc,none": 0.2547357926221336, "acc_stderr,none": 0.0036707941042800384, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23868225292242295, "acc_stderr,none": 0.006214864105695403, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3412698412698413, "acc_stderr,none": 0.04240799327574925}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.03427743175816524}, 
"mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.02933116229425172}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2320675105485232, "acc_stderr,none": 0.02747974455080851}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.18181818181818182, "acc_stderr,none": 0.03520893951097653}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.04077494709252626}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.03226219377286774}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.22832369942196531, "acc_stderr,none": 0.022598703804321614}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24022346368715083, "acc_stderr,none": 0.014288343803925307}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2379421221864952, "acc_stderr,none": 0.024185150647818704}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.22839506172839505, "acc_stderr,none": 0.023358211840626267}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2457627118644068, "acc_stderr,none": 0.01099615663514269}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2046783625730994, "acc_stderr,none": 0.030944459778533204}, "mmlu_other": {"acc,none": 0.24879304795622786, "acc_stderr,none": 0.007743618743955753, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.16, "acc_stderr,none": 0.036845294917747094}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24150943396226415, "acc_stderr,none": 0.02634148037111836}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2254335260115607, "acc_stderr,none": 0.03186209851641144}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.19282511210762332, "acc_stderr,none": 0.02647824096048936}, "mmlu_management": {"alias": " - management", "acc,none": 0.30097087378640774, "acc_stderr,none": 0.045416094465039476}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2863247863247863, "acc_stderr,none": 0.029614323690456648}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.227330779054917, "acc_stderr,none": 0.01498727064094602}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.30718954248366015, "acc_stderr,none": 0.026415601914389002}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2907801418439716, "acc_stderr,none": 0.027090664368353178}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.026799562024887678}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2289156626506024, "acc_stderr,none": 0.03270745277352477}, "mmlu_social_sciences": {"acc,none": 0.27786805329866754, "acc_stderr,none": 0.008048971244659723, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.18421052631578946, "acc_stderr,none": 0.03646758875075566}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2676767676767677, "acc_stderr,none": 
0.03154449888270286}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.27461139896373055, "acc_stderr,none": 0.03221024508041156}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3282051282051282, "acc_stderr,none": 0.023807633198657266}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.029344572500634342}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.29541284403669726, "acc_stderr,none": 0.019560619182976}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306086}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.22875816993464052, "acc_stderr,none": 0.01699272346546625}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.04069306319721376}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.37551020408163266, "acc_stderr,none": 0.03100120903989484}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.27860696517412936, "acc_stderr,none": 0.031700561834973086}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_stem": {"acc,none": 0.26197272438947034, "acc_stderr,none": 0.007831341951374497, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653695}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2074074074074074, "acc_stderr,none": 0.03502553170678316}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3092105263157895, "acc_stderr,none": 0.03761070869867479}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.30392156862745096, "acc_stderr,none": 0.04576665403207762}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.22127659574468084, "acc_stderr,none": 0.027136349602424052}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.03600105692727771}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24867724867724866, "acc_stderr,none": 0.022261817692400168}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.27741935483870966, "acc_stderr,none": 0.025470196835900055}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.24630541871921183, "acc_stderr,none": 0.030315099285617732}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", 
"acc,none": 0.3, "acc_stderr,none": 0.027940457136228405}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2913907284768212, "acc_stderr,none": 0.03710185726119995}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.24537037037037038, "acc_stderr,none": 0.029346665094372944}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.22321428571428573, "acc_stderr,none": 0.03952301967702511}, "mmlu_pro": {"exact_match,custom-extract": 0.09815492021276596, "exact_match_stderr,custom-extract": 0.002709321532083858, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.10599721059972106, "exact_match_stderr,custom-extract": 0.011504298561744808}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11153358681875793, "exact_match_stderr,custom-extract": 0.011213991771867689}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.08745583038869258, "exact_match_stderr,custom-extract": 0.008400207784455759}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.07560975609756097, "exact_match_stderr,custom-extract": 0.013072388347810092}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.10900473933649289, "exact_match_stderr,custom-extract": 0.010733629491578632}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08771929824561403, "exact_match_stderr,custom-extract": 0.009092308015144608}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12469437652811736, "exact_match_stderr,custom-extract": 0.011558254824072245}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.07349081364829396, "exact_match_stderr,custom-extract": 0.013385962202955822}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09627611262488647, "exact_match_stderr,custom-extract": 0.008893665915732372}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07401924500370097, "exact_match_stderr,custom-extract": 0.007125353603845778}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11580086580086581, "exact_match_stderr,custom-extract": 0.01053246671607756}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.0781563126252505, "exact_match_stderr,custom-extract": 0.012028071535520735}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09545804464973057, "exact_match_stderr,custom-extract": 0.008156113834567354}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.13032581453634084, "exact_match_stderr,custom-extract": 0.01192516379322867}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.218, "acc_stderr,none": 0.018483378223178856, "acc_norm,none": 0.328, "acc_norm_stderr,none": 0.02101702716517549}, "piqa": {"alias": "piqa", "acc,none": 0.6479869423286181, "acc_stderr,none": 0.01114314895306609, "acc_norm,none": 0.6490750816104461, "acc_norm_stderr,none": 0.011135250564776787}, "race": {"alias": "race", "acc,none": 0.33779904306220093, "acc_stderr,none": 0.014637734314782855}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.41914022517911975, "acc_stderr,none": 0.01116514070817033}, "winogrande": {"alias": "winogrande", "acc,none": 0.5880031570639306, "acc_stderr,none": 0.013833112857645937}} {"created_at": "2025-04-16T18:14:47.022245", "global_step": 96000, 
"arc_challenge": {"alias": "arc_challenge", "acc,none": 0.29692832764505117, "acc_stderr,none": 0.013352025976725222, "acc_norm,none": 0.3267918088737201, "acc_norm_stderr,none": 0.013706665975587333}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5980639730639731, "acc_stderr,none": 0.010060521220920566, "acc_norm,none": 0.5749158249158249, "acc_norm_stderr,none": 0.01014396619571784}, "boolq": {"alias": "boolq", "acc,none": 0.6256880733944954, "acc_stderr,none": 0.008464246656443236}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19328419328419327, "acc_stderr,none": 0.01130520748682769}, "copa": {"alias": "copa", "acc,none": 0.69, "acc_stderr,none": 0.04648231987117316}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3737303326030671, "acc_stderr,none": 0.004828045774734895, "acc_norm,none": 0.48307110137422826, "acc_norm_stderr,none": 0.00498692057228442}, "mmlu": {"acc,none": 0.2548070075487822, "acc_stderr,none": 0.0036730405567748747, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.271413390010627, "acc_stderr,none": 0.006482088447957408, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.037184890068181146}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.03546563019624335}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.0309645179269234}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.26582278481012656, "acc_stderr,none": 0.028756799629658335}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.38016528925619836, "acc_stderr,none": 0.04431324501968432}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.041331194402438376}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3006134969325153, "acc_stderr,none": 0.03602511318806771}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.26878612716763006, "acc_stderr,none": 0.02386800326250011}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.27009646302250806, "acc_stderr,none": 0.02521804037341062}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2932098765432099, "acc_stderr,none": 0.025329888171900922}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2737940026075619, "acc_stderr,none": 0.01138861216797938}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.29239766081871343, "acc_stderr,none": 0.034886477134579215}, "mmlu_other": {"acc,none": 0.23688445445767622, "acc_stderr,none": 0.007612770554091302, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2037735849056604, "acc_stderr,none": 0.024790784501775406}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0326926380614177}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.19730941704035873, "acc_stderr,none": 
0.02670985334496796}, "mmlu_management": {"alias": " - management", "acc,none": 0.18446601941747573, "acc_stderr,none": 0.03840423627288276}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.23931623931623933, "acc_stderr,none": 0.027951826808924333}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.26947637292464877, "acc_stderr,none": 0.015866243073215047}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24836601307189543, "acc_stderr,none": 0.024739981355113596}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.26595744680851063, "acc_stderr,none": 0.026358065698880592}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.17279411764705882, "acc_stderr,none": 0.02296606758558176}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.19879518072289157, "acc_stderr,none": 0.03106939026078941}, "mmlu_social_sciences": {"acc,none": 0.24244393890152746, "acc_stderr,none": 0.007727396433170278, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.20175438596491227, "acc_stderr,none": 0.037752050135836386}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2676767676767677, "acc_stderr,none": 0.03154449888270285}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22797927461139897, "acc_stderr,none": 0.030276909945178267}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2153846153846154, "acc_stderr,none": 0.020843034557462878}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2184873949579832, "acc_stderr,none": 0.026841514322958927}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23119266055045873, "acc_stderr,none": 0.018075750241633156}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.037276735755969195}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27124183006535946, "acc_stderr,none": 0.017986615304030323}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2, "acc_stderr,none": 0.03831305140884601}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2816326530612245, "acc_stderr,none": 0.02879518557429129}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.25870646766169153, "acc_stderr,none": 0.03096590312357303}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909281}, "mmlu_stem": {"acc,none": 0.2597526165556613, "acc_stderr,none": 0.007802212549228165, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3037037037037037, "acc_stderr,none": 0.039725528847851375}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.03690677986137283}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566017}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653696}, "mmlu_college_computer_science": {"alias": 
" - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2170212765957447, "acc_stderr,none": 0.026947483121496228}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2896551724137931, "acc_stderr,none": 0.037800192304380135}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776564}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.26129032258064516, "acc_stderr,none": 0.024993053397764822}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3054187192118227, "acc_stderr,none": 0.03240661565868408}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.27037037037037037, "acc_stderr,none": 0.02708037281514566}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.03603038545360384}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.19444444444444445, "acc_stderr,none": 0.026991454502036726}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.19642857142857142, "acc_stderr,none": 0.03770970049347019}, "mmlu_pro": {"exact_match,custom-extract": 0.09059175531914894, "exact_match_stderr,custom-extract": 0.0026070828130951803, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.08228730822873083, "exact_match_stderr,custom-extract": 0.010269830737183232}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.07731305449936629, "exact_match_stderr,custom-extract": 0.009514599128594299}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0636042402826855, "exact_match_stderr,custom-extract": 0.007256738135342953}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.07804878048780488, "exact_match_stderr,custom-extract": 0.013264026422963562}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.10071090047393365, "exact_match_stderr,custom-extract": 0.010365114807941393}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.06501547987616099, "exact_match_stderr,custom-extract": 0.007924519124852931}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.13080684596577016, "exact_match_stderr,custom-extract": 0.011796749495986738}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.08398950131233596, "exact_match_stderr,custom-extract": 0.014228886942972686}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08810172570390554, "exact_match_stderr,custom-extract": 0.008546121482440766}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06365655070318282, 
"exact_match_stderr,custom-extract": 0.006644652222081482}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.13528138528138528, "exact_match_stderr,custom-extract": 0.011257853023057847}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1342685370741483, "exact_match_stderr,custom-extract": 0.015277913884522465}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07698229407236336, "exact_match_stderr,custom-extract": 0.007398830773812016}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.13032581453634084, "exact_match_stderr,custom-extract": 0.011925163793228668}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.236, "acc_stderr,none": 0.019008699622084728, "acc_norm,none": 0.346, "acc_norm_stderr,none": 0.021294951277234637}, "piqa": {"alias": "piqa", "acc,none": 0.6648531011969532, "acc_stderr,none": 0.01101351312864394, "acc_norm,none": 0.6621327529923831, "acc_norm_stderr,none": 0.011035474307853848}, "race": {"alias": "race", "acc,none": 0.35023923444976074, "acc_stderr,none": 0.014764164998575968}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4053224155578301, "acc_stderr,none": 0.011109383877623348}, "winogrande": {"alias": "winogrande", "acc,none": 0.5935280189423836, "acc_stderr,none": 0.013804448697753376}} {"created_at": "2025-04-16T20:02:52.858235", "global_step": 98000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.28668941979522183, "acc_stderr,none": 0.013214986329274776, "acc_norm,none": 0.3165529010238908, "acc_norm_stderr,none": 0.013592431519068079}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6001683501683501, "acc_stderr,none": 0.010051788039412915, "acc_norm,none": 0.57996632996633, "acc_norm_stderr,none": 0.010127718838529321}, "boolq": {"alias": "boolq", "acc,none": 0.6110091743119266, "acc_stderr,none": 0.008526800159503198}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20393120393120392, "acc_stderr,none": 0.011535521334313655}, "copa": {"alias": "copa", "acc,none": 0.66, "acc_stderr,none": 0.04760952285695238}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3781119298944433, "acc_stderr,none": 0.004839247332606038, "acc_norm,none": 0.4854610635331607, "acc_norm_stderr,none": 0.0049876714786409575}, "mmlu": {"acc,none": 0.23095000712149266, "acc_stderr,none": 0.0035506164406411437, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2456960680127524, "acc_stderr,none": 0.006272804287554439, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04006168083848878}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.22424242424242424, "acc_stderr,none": 0.03256866661681102}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615767}, "mmlu_moral_disputes": {"alias": " - moral_disputes", 
"acc,none": 0.24566473988439305, "acc_stderr,none": 0.023176298203992012}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.18006430868167203, "acc_stderr,none": 0.02182342285774494}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21604938271604937, "acc_stderr,none": 0.02289916291844581}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.258148631029987, "acc_stderr,none": 0.0111769237193134}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.23978113936272932, "acc_stderr,none": 0.007643286433260536, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891366}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.03095289021774988}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23754789272030652, "acc_stderr,none": 0.015218733046150193}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023805186524888135}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.23049645390070922, "acc_stderr,none": 0.025123739226872405}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.18382352941176472, "acc_stderr,none": 0.023529242185193106}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.21774455638609036, "acc_stderr,none": 0.007436791962116233, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281335}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.17676767676767677, "acc_stderr,none": 0.027178752639044915}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20207253886010362, "acc_stderr,none": 0.02897908979429673}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2, "acc_stderr,none": 0.020280805062535722}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1926605504587156, "acc_stderr,none": 0.016909276884936094}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 
0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.01755581809132227}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19183673469387755, "acc_stderr,none": 0.02520696315422542}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.2131303520456708, "acc_stderr,none": 0.007281222067697733, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.0335567721631314}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.17763157894736842, "acc_stderr,none": 0.031103182383123398}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.02880998985410297}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21164021164021163, "acc_stderr,none": 0.02103733150526289}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.18064516129032257, "acc_stderr,none": 0.021886178567172548}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1477832512315271, "acc_stderr,none": 0.024969621333521274}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.21851851851851853, "acc_stderr,none": 0.025195752251823793}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.18543046357615894, "acc_stderr,none": 0.031732843842942865}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.025416428388767478}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "mmlu_pro": {"exact_match,custom-extract": 0.09757313829787234, "exact_match_stderr,custom-extract": 0.0026906247408547486, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.15202231520223153, 
"exact_match_stderr,custom-extract": 0.013418048947063343}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11026615969581749, "exact_match_stderr,custom-extract": 0.011158044019782562}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.052120141342756186, "exact_match_stderr,custom-extract": 0.006609188484367764}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1024390243902439, "exact_match_stderr,custom-extract": 0.01499350068423847}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1504739336492891, "exact_match_stderr,custom-extract": 0.01231417168822253}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07017543859649122, "exact_match_stderr,custom-extract": 0.008210231372398031}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1039119804400978, "exact_match_stderr,custom-extract": 0.01067571861205893}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.09711286089238845, "exact_match_stderr,custom-extract": 0.015190193611399451}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09809264305177112, "exact_match_stderr,custom-extract": 0.008968149521850266}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06291635825314582, "exact_match_stderr,custom-extract": 0.006608518078813468}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11038961038961038, "exact_match_stderr,custom-extract": 0.010314856083401151}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13827655310621242, "exact_match_stderr,custom-extract": 0.015468334539576873}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.06774441878367975, "exact_match_stderr,custom-extract": 0.006975364557816214}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.13533834586466165, "exact_match_stderr,custom-extract": 0.012117258449225144}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.218, "acc_stderr,none": 0.018483378223178863, "acc_norm,none": 0.34, "acc_norm_stderr,none": 0.021206117013673066}, "piqa": {"alias": "piqa", "acc,none": 0.6556039173014145, "acc_stderr,none": 0.011086521237125621, "acc_norm,none": 0.6643090315560392, "acc_norm_stderr,none": 0.011017938116656325}, "race": {"alias": "race", "acc,none": 0.3320574162679426, "acc_stderr,none": 0.014575582129545914}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4017400204708291, "acc_stderr,none": 0.011093444192711178}, "winogrande": {"alias": "winogrande", "acc,none": 0.5603788476716653, "acc_stderr,none": 0.013949649776015701}} {"created_at": "2025-04-16T22:50:30.230459", "global_step": 100000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2883959044368601, "acc_stderr,none": 0.013238394422428164, "acc_norm,none": 0.318259385665529, "acc_norm_stderr,none": 0.013611993916971451}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5871212121212122, "acc_stderr,none": 0.010102837421104668, "acc_norm,none": 0.5618686868686869, "acc_norm_stderr,none": 0.010180937100600064}, "boolq": {"alias": "boolq", "acc,none": 0.5828746177370031, "acc_stderr,none": 0.008624092785001314}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19574119574119575, "acc_stderr,none": 0.011359497363584395}, "copa": {"alias": "copa", "acc,none": 0.69, "acc_stderr,none": 0.04648231987117316}, "hellaswag": 
{"alias": "hellaswag", "acc,none": 0.38169687313284206, "acc_stderr,none": 0.004848099661619699, "acc_norm,none": 0.49063931487751444, "acc_norm_stderr,none": 0.004988906901307742}, "mmlu": {"acc,none": 0.23023785785500642, "acc_stderr,none": 0.003546900728790733, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24378320935175346, "acc_stderr,none": 0.0062574715751615205, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.040406101782088394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.22424242424242424, "acc_stderr,none": 0.03256866661681102}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480764}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21604938271604937, "acc_stderr,none": 0.02289916291844581}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2503259452411995, "acc_stderr,none": 0.01106415102716543}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.2404248471194078, "acc_stderr,none": 0.007650239186095163, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891366}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.03095289021774988}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23754789272030652, "acc_stderr,none": 0.015218733046150193}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22875816993464052, "acc_stderr,none": 0.024051029739912258}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.23404255319148937, 
"acc_stderr,none": 0.025257861359432414}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1875, "acc_stderr,none": 0.023709788253811766}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.21741956451088723, "acc_stderr,none": 0.007433074431341765, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281335}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.17676767676767677, "acc_stderr,none": 0.027178752639044915}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20207253886010362, "acc_stderr,none": 0.02897908979429673}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20256410256410257, "acc_stderr,none": 0.020377660970371386}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1926605504587156, "acc_stderr,none": 0.016909276884936094}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25, "acc_stderr,none": 0.01751781884501444}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.18775510204081633, "acc_stderr,none": 0.025000256039546212}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.21249603552172533, "acc_stderr,none": 0.007271218700485504, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.0335567721631314}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.17763157894736842, "acc_stderr,none": 0.031103182383123398}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.02880998985410297}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 
0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.20899470899470898, "acc_stderr,none": 0.02094048156533485}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1774193548387097, "acc_stderr,none": 0.02173254068932927}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.15270935960591134, "acc_stderr,none": 0.025308904539380627}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.024882116857655113}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1527777777777778, "acc_stderr,none": 0.02453632602613422}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.08527260638297872, "exact_match_stderr,custom-extract": 0.0025387951145568405, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.09483960948396095, "exact_match_stderr,custom-extract": 0.010949672704790076}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.08238276299112801, "exact_match_stderr,custom-extract": 0.009794580713934192}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.045053003533568906, "exact_match_stderr,custom-extract": 0.006167656890375391}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.06585365853658537, "exact_match_stderr,custom-extract": 0.012264102126252811}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.11374407582938388, "exact_match_stderr,custom-extract": 0.010935286894179342}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07946336429308566, "exact_match_stderr,custom-extract": 0.008692933034367013}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.10024449877750612, "exact_match_stderr,custom-extract": 0.010507066995360123}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12860892388451445, "exact_match_stderr,custom-extract": 0.01717316362524469}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.06721162579473206, "exact_match_stderr,custom-extract": 0.00754948662630062}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.05995558845299778, "exact_match_stderr,custom-extract": 0.00646133318809624}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.09740259740259741, "exact_match_stderr,custom-extract": 0.009759587414564102}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12024048096192384, "exact_match_stderr,custom-extract": 0.014574466566661969}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08622016936104696, "exact_match_stderr,custom-extract": 0.0077909043682681525}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.11779448621553884, "exact_match_stderr,custom-extract": 0.011418740524805815}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.23, "acc_stderr,none": 
0.018839050391123137, "acc_norm,none": 0.346, "acc_norm_stderr,none": 0.021294951277234637}, "piqa": {"alias": "piqa", "acc,none": 0.6496191512513602, "acc_stderr,none": 0.01113127755468173, "acc_norm,none": 0.6583242655059848, "acc_norm_stderr,none": 0.01106553514384152}, "race": {"alias": "race", "acc,none": 0.3349282296650718, "acc_stderr,none": 0.014606961503556257}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.39508700102354144, "acc_stderr,none": 0.011062205128594174}, "winogrande": {"alias": "winogrande", "acc,none": 0.579321231254933, "acc_stderr,none": 0.013874526372008323}} {"created_at": "2025-04-16T23:40:35.537979", "global_step": 102000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2986348122866894, "acc_stderr,none": 0.013374078615068749, "acc_norm,none": 0.3361774744027304, "acc_norm_stderr,none": 0.013804855026205763}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5993265993265994, "acc_stderr,none": 0.010055304474255573, "acc_norm,none": 0.5711279461279462, "acc_norm_stderr,none": 0.010155440652900152}, "boolq": {"alias": "boolq", "acc,none": 0.5871559633027523, "acc_stderr,none": 0.008611172430472871}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.1941031941031941, "acc_stderr,none": 0.011323381588920453}, "copa": {"alias": "copa", "acc,none": 0.69, "acc_stderr,none": 0.04648231987117316}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3824935271858196, "acc_stderr,none": 0.004850028813189978, "acc_norm,none": 0.4870543716391157, "acc_norm_stderr,none": 0.004988108663179759}, "mmlu": {"acc,none": 0.22952570858852014, "acc_stderr,none": 0.00354307108548746, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24208289054197663, "acc_stderr,none": 0.0062426684031394305, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.040406101782088394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03225078108306289}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480764}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21604938271604937, "acc_stderr,none": 0.02289916291844581}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2457627118644068, "acc_stderr,none": 0.01099615663514269}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.24010299324106857, 
"acc_stderr,none": 0.007645845484601076, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891366}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.03095289021774988}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23754789272030652, "acc_stderr,none": 0.015218733046150193}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.0239291555173513}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2375886524822695, "acc_stderr,none": 0.025389512552729903}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.18382352941176472, "acc_stderr,none": 0.023529242185193106}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.2170945726356841, "acc_stderr,none": 0.007428786285788536, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281335}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.17676767676767677, "acc_stderr,none": 0.027178752639044915}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20256410256410257, "acc_stderr,none": 0.020377660970371386}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1926605504587156, "acc_stderr,none": 0.016909276884936094}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25, "acc_stderr,none": 0.01751781884501444}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.18775510204081633, "acc_stderr,none": 0.025000256039546212}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.21249603552172533, "acc_stderr,none": 0.007271218700485504, 
"alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.0335567721631314}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.17763157894736842, "acc_stderr,none": 0.031103182383123398}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.02880998985410297}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.20899470899470898, "acc_stderr,none": 0.02094048156533485}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1774193548387097, "acc_stderr,none": 0.02173254068932927}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.15270935960591134, "acc_stderr,none": 0.025308904539380627}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.024882116857655113}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1527777777777778, "acc_stderr,none": 0.02453632602613422}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.09665890957446809, "exact_match_stderr,custom-extract": 0.0026872535902519134, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.09623430962343096, "exact_match_stderr,custom-extract": 0.011021390278898282}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.12040557667934093, "exact_match_stderr,custom-extract": 0.011593149221103029}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06802120141342756, "exact_match_stderr,custom-extract": 0.007486759168004512}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.0951219512195122, "exact_match_stderr,custom-extract": 0.014506870947377841}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1362559241706161, "exact_match_stderr,custom-extract": 0.011815618235249946}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 
0.08978328173374613, "exact_match_stderr,custom-extract": 0.009188242804900283}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.09902200488997555, "exact_match_stderr,custom-extract": 0.010449894873209607}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10236220472440945, "exact_match_stderr,custom-extract": 0.015549935163883116}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10899182561307902, "exact_match_stderr,custom-extract": 0.009395966618356963}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.056254626202812734, "exact_match_stderr,custom-extract": 0.0062710412174384065}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10822510822510822, "exact_match_stderr,custom-extract": 0.010225646711914652}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1282565130260521, "exact_match_stderr,custom-extract": 0.01498371136300156}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08237105465742879, "exact_match_stderr,custom-extract": 0.007631036296293892}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.11779448621553884, "exact_match_stderr,custom-extract": 0.011418740524805822}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.218, "acc_stderr,none": 0.01848337822317886, "acc_norm,none": 0.334, "acc_norm_stderr,none": 0.021113492347743727}, "piqa": {"alias": "piqa", "acc,none": 0.6556039173014145, "acc_stderr,none": 0.011086521237125625, "acc_norm,none": 0.6572361262241567, "acc_norm_stderr,none": 0.011073978007039314}, "race": {"alias": "race", "acc,none": 0.33588516746411484, "acc_stderr,none": 0.014617286312430686}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.3925281473899693, "acc_stderr,none": 0.011049620449690389}, "winogrande": {"alias": "winogrande", "acc,none": 0.574585635359116, "acc_stderr,none": 0.013895257666646378}} {"created_at": "2025-04-17T01:31:17.443474", "global_step": 104000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2909556313993174, "acc_stderr,none": 0.013273077865907588, "acc_norm,none": 0.3319112627986348, "acc_norm_stderr,none": 0.013760988200880534}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6224747474747475, "acc_stderr,none": 0.009947227833469427, "acc_norm,none": 0.61489898989899, "acc_norm_stderr,none": 0.00998521479873725}, "boolq": {"alias": "boolq", "acc,none": 0.44097859327217126, "acc_stderr,none": 0.008683913982298877}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2153972153972154, "acc_stderr,none": 0.011769690686226967}, "copa": {"alias": "copa", "acc,none": 0.69, "acc_stderr,none": 0.04648231987117316}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38129854610635333, "acc_stderr,none": 0.004847129907908663, "acc_norm,none": 0.49063931487751444, "acc_norm_stderr,none": 0.00498890690130774}, "mmlu": {"acc,none": 0.2299529981484119, "acc_stderr,none": 0.0035453493366500275, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24165781083953242, "acc_stderr,none": 0.0062378599415563755, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.29365079365079366, "acc_stderr,none": 0.04073524322147125}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2, "acc_stderr,none": 0.031234752377721175}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 
0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480764}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21604938271604937, "acc_stderr,none": 0.02289916291844581}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2457627118644068, "acc_stderr,none": 0.01099615663514269}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.24074670099774703, "acc_stderr,none": 0.007652208498068477, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891366}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.03095289021774988}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23754789272030652, "acc_stderr,none": 0.015218733046150193}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22875816993464052, "acc_stderr,none": 0.024051029739912258}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2375886524822695, "acc_stderr,none": 0.025389512552729903}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.18382352941176472, "acc_stderr,none": 0.023529242185193106}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.2183945401364966, "acc_stderr,none": 0.00744589874480811, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.039994238792813344}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.17676767676767677, "acc_stderr,none": 0.027178752639044915}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", 
"acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20512820512820512, "acc_stderr,none": 0.020473233173551975}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.026653531596715484}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1926605504587156, "acc_stderr,none": 0.016909276884936094}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25, "acc_stderr,none": 0.01751781884501444}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19591836734693877, "acc_stderr,none": 0.025409301953225678}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.2131303520456708, "acc_stderr,none": 0.007279285247212745, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.0335567721631314}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.17763157894736842, "acc_stderr,none": 0.031103182383123398}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.02880998985410297}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.20899470899470898, "acc_stderr,none": 0.02094048156533485}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1774193548387097, "acc_stderr,none": 0.02173254068932927}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.15270935960591134, "acc_stderr,none": 0.025308904539380627}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.024882116857655113}, "mmlu_high_school_physics": {"alias": " - high_school_physics", 
"acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1574074074074074, "acc_stderr,none": 0.02483717351824239}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.09117353723404255, "exact_match_stderr,custom-extract": 0.0026161547714151617, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.09065550906555091, "exact_match_stderr,custom-extract": 0.010730125693788095}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09632446134347275, "exact_match_stderr,custom-extract": 0.010510211344233734}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.03356890459363958, "exact_match_stderr,custom-extract": 0.005355780010493138}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.11463414634146342, "exact_match_stderr,custom-extract": 0.01575276269742973}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.12796208530805686, "exact_match_stderr,custom-extract": 0.011505210023672562}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08359133126934984, "exact_match_stderr,custom-extract": 0.008895851747412878}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1136919315403423, "exact_match_stderr,custom-extract": 0.011105705318793027}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.09711286089238845, "exact_match_stderr,custom-extract": 0.015190193611399451}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07629427792915532, "exact_match_stderr,custom-extract": 0.008004172836965893}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07549962990377498, "exact_match_stderr,custom-extract": 0.00719049968840916}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.09848484848484848, "exact_match_stderr,custom-extract": 0.009807772312247526}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12024048096192384, "exact_match_stderr,custom-extract": 0.014574466566661975}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09237875288683603, "exact_match_stderr,custom-extract": 0.008037130651407133}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.11904761904761904, "exact_match_stderr,custom-extract": 0.011471162000842543}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.246, "acc_stderr,none": 0.019279819056352555, "acc_norm,none": 0.342, "acc_norm_stderr,none": 0.02123614719989926}, "piqa": {"alias": "piqa", "acc,none": 0.6594124047878128, "acc_stderr,none": 0.011057027540404739, "acc_norm,none": 0.6561479869423286, "acc_norm_stderr,none": 0.011082356277961395}, "race": {"alias": "race", "acc,none": 0.32727272727272727, "acc_stderr,none": 0.014521924541567923}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.41760491299897645, "acc_stderr,none": 0.011159391894922484}, "winogrande": {"alias": "winogrande", "acc,none": 0.5887924230465666, "acc_stderr,none": 0.013829128358676866}} {"created_at": "2025-04-17T03:20:00.358866", "global_step": 106000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.29948805460750855, "acc_stderr,none": 0.013385021637313569, 
"acc_norm,none": 0.3387372013651877, "acc_norm_stderr,none": 0.013830568927974332}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6005892255892256, "acc_stderr,none": 0.010050018228742125, "acc_norm,none": 0.5580808080808081, "acc_norm_stderr,none": 0.010190328123071773}, "boolq": {"alias": "boolq", "acc,none": 0.5788990825688073, "acc_stderr,none": 0.008635491562221343}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20884520884520885, "acc_stderr,none": 0.011637590576063056}, "copa": {"alias": "copa", "acc,none": 0.7, "acc_stderr,none": 0.046056618647183814}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38309101772555265, "acc_stderr,none": 0.0048514666236014505, "acc_norm,none": 0.48844851623182634, "acc_norm_stderr,none": 0.004988449593007262}, "mmlu": {"acc,none": 0.2679105540521293, "acc_stderr,none": 0.0037227733230913835, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2505844845908608, "acc_stderr,none": 0.006310841750785947, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3412698412698413, "acc_stderr,none": 0.04240799327574925}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.03346409881055953}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23039215686274508, "acc_stderr,none": 0.029554292605695053}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.20253164556962025, "acc_stderr,none": 0.026160568246601457}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.19834710743801653, "acc_stderr,none": 0.036401182719909456}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.041331194402438376}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.24539877300613497, "acc_stderr,none": 0.03380939813943354}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100178}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2379421221864952, "acc_stderr,none": 0.024185150647818707}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.0227797190887334}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2816166883963494, "acc_stderr,none": 0.011487783272786696}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.21637426900584794, "acc_stderr,none": 0.03158149539338733}, "mmlu_other": {"acc,none": 0.25426456388799484, "acc_stderr,none": 0.007780876102623183, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.31, "acc_stderr,none": 0.046482319871173156}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21132075471698114, "acc_stderr,none": 0.02512576648482785}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.0332055644308557}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.23766816143497757, "acc_stderr,none": 0.028568079464714267}, "mmlu_management": {"alias": " - management", "acc,none": 0.2912621359223301, "acc_stderr,none": 
0.044986763205729224}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2606837606837607, "acc_stderr,none": 0.028760348956523414}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2120051085568327, "acc_stderr,none": 0.014616099385833688}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.238562091503268, "acc_stderr,none": 0.024404394928087863}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25886524822695034, "acc_stderr,none": 0.026129572527180844}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3713235294117647, "acc_stderr,none": 0.02934980313976587}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25301204819277107, "acc_stderr,none": 0.033844291552331346}, "mmlu_social_sciences": {"acc,none": 0.287292817679558, "acc_stderr,none": 0.008138112412923004, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281334}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3282828282828283, "acc_stderr,none": 0.033456784227567746}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2849740932642487, "acc_stderr,none": 0.0325771407770966}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.30512820512820515, "acc_stderr,none": 0.023346335293325887}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3319327731092437, "acc_stderr,none": 0.030588697013783663}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3284403669724771, "acc_stderr,none": 0.020135902797298388}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.23366013071895425, "acc_stderr,none": 0.017119158496044506}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.19090909090909092, "acc_stderr,none": 0.03764425585984925}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3183673469387755, "acc_stderr,none": 0.02982253379398207}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.263681592039801, "acc_stderr,none": 0.031157150869355554}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_stem": {"acc,none": 0.2882968601332065, "acc_stderr,none": 0.008021336275362619, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.037857144650666544}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.03690677986137282}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.03745554791462457}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.37, "acc_stderr,none": 0.048523658709391}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.41, "acc_stderr,none": 0.049431107042371025}, 
"mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3137254901960784, "acc_stderr,none": 0.04617034827006715}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2425531914893617, "acc_stderr,none": 0.028020226271200217}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.036951833116502325}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776554}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3032258064516129, "acc_stderr,none": 0.026148685930671746}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.03144712581678242}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036622}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.026593939101844065}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.37748344370860926, "acc_stderr,none": 0.0395802723112157}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.4166666666666667, "acc_stderr,none": 0.03362277436608044}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.17857142857142858, "acc_stderr,none": 0.03635209121577806}, "mmlu_pro": {"exact_match,custom-extract": 0.10206117021276596, "exact_match_stderr,custom-extract": 0.002755728143162264, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.11854951185495119, "exact_match_stderr,custom-extract": 0.01208070655224857}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09125475285171103, "exact_match_stderr,custom-extract": 0.010258543729935024}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07420494699646643, "exact_match_stderr,custom-extract": 0.007793679728569114}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.10975609756097561, "exact_match_stderr,custom-extract": 0.015456358358757447}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1362559241706161, "exact_match_stderr,custom-extract": 0.011815618235249946}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10113519091847266, "exact_match_stderr,custom-extract": 0.00969082296121734}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1198044009779951, "exact_match_stderr,custom-extract": 0.011360957995074552}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.13385826771653545, "exact_match_stderr,custom-extract": 0.0174672800793266}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10263396911898275, "exact_match_stderr,custom-extract": 0.009150272599112007}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07401924500370097, "exact_match_stderr,custom-extract": 0.007125353603845783}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 
0.09848484848484848, "exact_match_stderr,custom-extract": 0.009807772312247517}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12024048096192384, "exact_match_stderr,custom-extract": 0.01457446656666197}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09083910700538876, "exact_match_stderr,custom-extract": 0.00797663013967019}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12280701754385964, "exact_match_stderr,custom-extract": 0.01162599162597017}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.238, "acc_stderr,none": 0.019064072958198446, "acc_norm,none": 0.324, "acc_norm_stderr,none": 0.020950557312477445}, "piqa": {"alias": "piqa", "acc,none": 0.6517954298150164, "acc_stderr,none": 0.0111152263432444, "acc_norm,none": 0.6588683351468988, "acc_norm_stderr,none": 0.011061289443962707}, "race": {"alias": "race", "acc,none": 0.3291866028708134, "acc_stderr,none": 0.01454359226657783}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.40890481064483114, "acc_stderr,none": 0.011124710055682836}, "winogrande": {"alias": "winogrande", "acc,none": 0.5445935280189423, "acc_stderr,none": 0.013996485037729775}} {"created_at": "2025-04-17T05:13:50.365367", "global_step": 108000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2790102389078498, "acc_stderr,none": 0.013106784883601341, "acc_norm,none": 0.3319112627986348, "acc_norm_stderr,none": 0.01376098820088054}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6106902356902357, "acc_stderr,none": 0.010005212782878142, "acc_norm,none": 0.585016835016835, "acc_norm_stderr,none": 0.01011038315196113}, "boolq": {"alias": "boolq", "acc,none": 0.6577981651376147, "acc_stderr,none": 0.008298126201987895}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19492219492219492, "acc_stderr,none": 0.011341478090883527}, "copa": {"alias": "copa", "acc,none": 0.67, "acc_stderr,none": 0.04725815626252607}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.37661820354511055, "acc_stderr,none": 0.004835475957610929, "acc_norm,none": 0.4854610635331607, "acc_norm_stderr,none": 0.004987671478640956}, "mmlu": {"acc,none": 0.23928215353938184, "acc_stderr,none": 0.003596004252366481, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2514346439957492, "acc_stderr,none": 0.006319602221579991, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03670066451047181}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.03192271569548299}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.031660096793998116}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.22362869198312235, "acc_stderr,none": 0.027123298205229972}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.03984979653302871}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25153374233128833, "acc_stderr,none": 0.034089978868575295}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.023445826276545546}, "mmlu_moral_scenarios": {"alias": " - 
moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19614147909967847, "acc_stderr,none": 0.02255244778047804}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2191358024691358, "acc_stderr,none": 0.023016705640262203}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2737940026075619, "acc_stderr,none": 0.011388612167979387}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.03565079670708311}, "mmlu_other": {"acc,none": 0.24557450917283552, "acc_stderr,none": 0.007711325409761621, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.31, "acc_stderr,none": 0.046482319871173156}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2188679245283019, "acc_stderr,none": 0.025447863825108604}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2254335260115607, "acc_stderr,none": 0.03186209851641144}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036624}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.1650485436893204, "acc_stderr,none": 0.03675668832233188}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.029480360549541194}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.015302380123542103}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22875816993464052, "acc_stderr,none": 0.024051029739912258}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.026011992930902013}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.024562204314142317}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.22749431264218395, "acc_stderr,none": 0.007552873778053777, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281335}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.20707070707070707, "acc_stderr,none": 0.02886977846026705}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.24870466321243523, "acc_stderr,none": 0.031195840877700304}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.19230769230769232, "acc_stderr,none": 0.019982347208637296}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.22268907563025211, "acc_stderr,none": 0.027025433498882374}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1963302752293578, "acc_stderr,none": 0.017030719339154357}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306086}, "mmlu_professional_psychology": {"alias": " - professional_psychology", 
"acc,none": 0.26143790849673204, "acc_stderr,none": 0.017776947157528044}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.04122066502878284}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.21224489795918366, "acc_stderr,none": 0.026176967197866767}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22885572139303484, "acc_stderr,none": 0.029705284056772426}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_stem": {"acc,none": 0.22645099904852523, "acc_stderr,none": 0.007447744981230568, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.0335567721631314}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19736842105263158, "acc_stderr,none": 0.03238981601699397}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.03586879280080341}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.0377525168068637}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.23404255319148937, "acc_stderr,none": 0.027678452578212394}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.25517241379310346, "acc_stderr,none": 0.03632984052707842}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21164021164021163, "acc_stderr,none": 0.021037331505262893}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.22258064516129034, "acc_stderr,none": 0.023664216671642518}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.21182266009852216, "acc_stderr,none": 0.028748983689941065}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23333333333333334, "acc_stderr,none": 0.025787874220959333}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.19205298013245034, "acc_stderr,none": 0.03216298420593614}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.02649191472735514}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3392857142857143, "acc_stderr,none": 0.044939490686135404}, "mmlu_pro": {"exact_match,custom-extract": 0.09491356382978723, "exact_match_stderr,custom-extract": 0.002660189945453964, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.13389121338912133, "exact_match_stderr,custom-extract": 0.012726405288731838}, "mmlu_pro_business": {"alias": " - business", 
"exact_match,custom-extract": 0.10519645120405577, "exact_match_stderr,custom-extract": 0.010929524923270317}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.05035335689045936, "exact_match_stderr,custom-extract": 0.006502254002193108}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.09268292682926829, "exact_match_stderr,custom-extract": 0.014338963443185472}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1338862559241706, "exact_match_stderr,custom-extract": 0.011728478505969236}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.0629514963880289, "exact_match_stderr,custom-extract": 0.007806320646481136}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.11246943765281174, "exact_match_stderr,custom-extract": 0.01105345104408259}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11286089238845144, "exact_match_stderr,custom-extract": 0.01623214090346142}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07084468664850137, "exact_match_stderr,custom-extract": 0.007735732733830214}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06661732050333087, "exact_match_stderr,custom-extract": 0.006786667382246556}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12987012987012986, "exact_match_stderr,custom-extract": 0.011064857512116043}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12424849699398798, "exact_match_stderr,custom-extract": 0.014781596611020431}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08314087759815242, "exact_match_stderr,custom-extract": 0.007663395880276777}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12656641604010024, "exact_match_stderr,custom-extract": 0.011777280638403513}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.232, "acc_stderr,none": 0.018896193591952038, "acc_norm,none": 0.356, "acc_norm_stderr,none": 0.02143471235607265}, "piqa": {"alias": "piqa", "acc,none": 0.6583242655059848, "acc_stderr,none": 0.011065535143841523, "acc_norm,none": 0.6479869423286181, "acc_norm_stderr,none": 0.011143148953066092}, "race": {"alias": "race", "acc,none": 0.3492822966507177, "acc_stderr,none": 0.014754834713104495}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4083930399181167, "acc_stderr,none": 0.011122558066098067}, "winogrande": {"alias": "winogrande", "acc,none": 0.5469613259668509, "acc_stderr,none": 0.0139903666321481}} {"created_at": "2025-04-17T07:00:02.197191", "global_step": 110000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2832764505119454, "acc_stderr,none": 0.013167478735134575, "acc_norm,none": 0.3267918088737201, "acc_norm_stderr,none": 0.013706665975587335}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5896464646464646, "acc_stderr,none": 0.010093531255765457, "acc_norm,none": 0.5774410774410774, "acc_norm_stderr,none": 0.01013597822298108}, "boolq": {"alias": "boolq", "acc,none": 0.45932721712538227, "acc_stderr,none": 0.008716073497171078}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20556920556920558, "acc_stderr,none": 0.011569834551534302}, "copa": {"alias": "copa", "acc,none": 0.71, "acc_stderr,none": 0.04560480215720684}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3805018920533758, "acc_stderr,none": 0.004845180034271622, 
"acc_norm,none": 0.4811790479984067, "acc_norm_stderr,none": 0.0049862451154284535}, "mmlu": {"acc,none": 0.24497934767127189, "acc_stderr,none": 0.0036280290898864664, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2512221041445271, "acc_stderr,none": 0.006322724416808125, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1984126984126984, "acc_stderr,none": 0.035670166752768656}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.03317505930009182}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.030587591351604246}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.26582278481012656, "acc_stderr,none": 0.02875679962965834}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2892561983471074, "acc_stderr,none": 0.041391127276354626}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.040191074725573483}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2883435582822086, "acc_stderr,none": 0.03559039531617342}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2630057803468208, "acc_stderr,none": 0.023703099525258165}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2346368715083799, "acc_stderr,none": 0.014173044098303682}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.24437299035369775, "acc_stderr,none": 0.024406162094668893}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2006172839506173, "acc_stderr,none": 0.0222823139497749}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2685788787483703, "acc_stderr,none": 0.011320056629121727}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.25146198830409355, "acc_stderr,none": 0.033275044238468436}, "mmlu_other": {"acc,none": 0.24460894753781784, "acc_stderr,none": 0.007704352183502389, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891366}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173043}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036622}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.23766816143497757, "acc_stderr,none": 0.028568079464714263}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.04185832598928315}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.02934311479809447}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.22988505747126436, "acc_stderr,none": 0.015046301846691814}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.27124183006535946, "acc_stderr,none": 0.02545775669666788}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.23049645390070922, "acc_stderr,none": 0.025123739226872402}, "mmlu_professional_medicine": {"alias": " - 
professional_medicine", "acc,none": 0.28308823529411764, "acc_stderr,none": 0.02736586113151381}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.22289156626506024, "acc_stderr,none": 0.03240004825594688}, "mmlu_social_sciences": {"acc,none": 0.24569385765355867, "acc_stderr,none": 0.007759721430479445, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.18421052631578946, "acc_stderr,none": 0.03646758875075566}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.30303030303030304, "acc_stderr,none": 0.03274287914026867}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22797927461139897, "acc_stderr,none": 0.030276909945178256}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.0224212736129237}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24369747899159663, "acc_stderr,none": 0.027886828078380558}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22935779816513763, "acc_stderr,none": 0.018025349724618688}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.036412970813137296}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.017952449196987862}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.04122066502878284}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.21224489795918366, "acc_stderr,none": 0.026176967197866767}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2537313432835821, "acc_stderr,none": 0.03076944496729602}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036847}, "mmlu_stem": {"acc,none": 0.2353314303837615, "acc_stderr,none": 0.007553804848004564, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.28289473684210525, "acc_stderr,none": 0.03665349695640767}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.19607843137254902, "acc_stderr,none": 0.03950581861179964}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.23404255319148937, "acc_stderr,none": 0.02767845257821239}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.036951833116502325}, "mmlu_elementary_mathematics": {"alias": " - 
elementary_mathematics", "acc,none": 0.1984126984126984, "acc_stderr,none": 0.02053948126188688}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2032258064516129, "acc_stderr,none": 0.02289168798455497}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.24630541871921183, "acc_stderr,none": 0.030315099285617722}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.21851851851851853, "acc_stderr,none": 0.025195752251823793}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2781456953642384, "acc_stderr,none": 0.03658603262763743}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.027920963147993662}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.22321428571428573, "acc_stderr,none": 0.039523019677025116}, "mmlu_pro": {"exact_match,custom-extract": 0.09050864361702128, "exact_match_stderr,custom-extract": 0.0026002662106765968, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.10599721059972106, "exact_match_stderr,custom-extract": 0.011504298561744805}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10519645120405577, "exact_match_stderr,custom-extract": 0.010929524923270312}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.037102473498233215, "exact_match_stderr,custom-extract": 0.005620308630822641}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.0951219512195122, "exact_match_stderr,custom-extract": 0.014506870947377836}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.13151658767772512, "exact_match_stderr,custom-extract": 0.011640114243452654}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.06501547987616099, "exact_match_stderr,custom-extract": 0.007924519124852925}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1234718826405868, "exact_match_stderr,custom-extract": 0.011509486100746407}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11548556430446194, "exact_match_stderr,custom-extract": 0.016395494305781078}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07629427792915532, "exact_match_stderr,custom-extract": 0.008004172836965895}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0458919319022946, "exact_match_stderr,custom-extract": 0.005695083161267806}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.13636363636363635, "exact_match_stderr,custom-extract": 0.011295719428226613}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11222444889779559, "exact_match_stderr,custom-extract": 0.014144273960803925}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08083140877598152, "exact_match_stderr,custom-extract": 0.0075657208918047236}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12155388471177944, "exact_match_stderr,custom-extract": 0.01157478210191104}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.228, "acc_stderr,none": 0.018781306529363204, "acc_norm,none": 0.34, 
"acc_norm_stderr,none": 0.021206117013673066}, "piqa": {"alias": "piqa", "acc,none": 0.6702937976060935, "acc_stderr,none": 0.010968357083095152, "acc_norm,none": 0.6561479869423286, "acc_norm_stderr,none": 0.011082356277961393}, "race": {"alias": "race", "acc,none": 0.33875598086124403, "acc_stderr,none": 0.014647857789710087}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4032753326509724, "acc_stderr,none": 0.011100350776719524}, "winogrande": {"alias": "winogrande", "acc,none": 0.5777426992896606, "acc_stderr,none": 0.013881582030658549}} {"created_at": "2025-04-17T08:49:40.816526", "global_step": 112000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3054607508532423, "acc_stderr,none": 0.013460080478002496, "acc_norm,none": 0.32849829351535836, "acc_norm_stderr,none": 0.013724978465537361}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5867003367003367, "acc_stderr,none": 0.010104361780747513, "acc_norm,none": 0.5513468013468014, "acc_norm_stderr,none": 0.010205540414612867}, "boolq": {"alias": "boolq", "acc,none": 0.5314984709480123, "acc_stderr,none": 0.008727684848615308}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19164619164619165, "acc_stderr,none": 0.01126862497880165}, "copa": {"alias": "copa", "acc,none": 0.69, "acc_stderr,none": 0.04648231987117316}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.37661820354511055, "acc_stderr,none": 0.004835475957610927, "acc_norm,none": 0.4856602270464051, "acc_norm_stderr,none": 0.004987728900897595}, "mmlu": {"acc,none": 0.2335137444808432, "acc_stderr,none": 0.0035657477754138366, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2363443145589798, "acc_stderr,none": 0.00619642634772854, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.039325376803928704}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.03374402644139404}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.03019028245350194}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.26582278481012656, "acc_stderr,none": 0.028756799629658335}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.24793388429752067, "acc_stderr,none": 0.03941897526516301}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.03226219377286774}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24566473988439305, "acc_stderr,none": 0.023176298203992012}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480764}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2191358024691358, "acc_stderr,none": 0.023016705640262196}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23989569752281617, "acc_stderr,none": 0.010906282617981647}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.031267817146631786}, "mmlu_other": {"acc,none": 0.25458641776633406, "acc_stderr,none": 
0.007799569554278547, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.04605661864718381}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2188679245283019, "acc_stderr,none": 0.02544786382510861}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2023121387283237, "acc_stderr,none": 0.030631145539198826}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3273542600896861, "acc_stderr,none": 0.03149384670994131}, "mmlu_management": {"alias": " - management", "acc,none": 0.20388349514563106, "acc_stderr,none": 0.0398913985953177}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3034188034188034, "acc_stderr,none": 0.030118210106942645}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2554278416347382, "acc_stderr,none": 0.015594955384455758}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.02555316999182652}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2198581560283688, "acc_stderr,none": 0.024706141070705477}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.22058823529411764, "acc_stderr,none": 0.02518778666022727}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2710843373493976, "acc_stderr,none": 0.03460579907553027}, "mmlu_social_sciences": {"acc,none": 0.22554436139096523, "acc_stderr,none": 0.007537665977530656, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748142}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.18181818181818182, "acc_stderr,none": 0.027479603010538787}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21761658031088082, "acc_stderr,none": 0.02977866303775296}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.21025641025641026, "acc_stderr,none": 0.020660597485026924}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.026265024608275886}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.21100917431192662, "acc_stderr,none": 0.017493922404112648}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25, "acc_stderr,none": 0.01751781884501444}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.20909090909090908, "acc_stderr,none": 0.03895091015724138}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24081632653061225, "acc_stderr,none": 0.027372942201788167}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22388059701492538, "acc_stderr,none": 0.02947525023601717}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_stem": {"acc,none": 0.21630193466539804, "acc_stderr,none": 0.007313975984322235, "alias": " - stem"}, 
"mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.1925925925925926, "acc_stderr,none": 0.03406542058502653}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.20394736842105263, "acc_stderr,none": 0.032790004063100515}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617746}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.25957446808510637, "acc_stderr,none": 0.028659179374292323}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.19576719576719576, "acc_stderr,none": 0.02043573097154179}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2032258064516129, "acc_stderr,none": 0.022891687984554963}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1477832512315271, "acc_stderr,none": 0.024969621333521274}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.02438843043398766}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1574074074074074, "acc_stderr,none": 0.02483717351824239}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.08826462765957446, "exact_match_stderr,custom-extract": 0.002576031379947453, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.10320781032078104, "exact_match_stderr,custom-extract": 0.011369612924724331}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10899873257287707, "exact_match_stderr,custom-extract": 0.011101630697795399}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.038869257950530034, "exact_match_stderr,custom-extract": 0.005747289272769039}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.06585365853658537, "exact_match_stderr,custom-extract": 0.012264102126252825}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.13033175355450238, "exact_match_stderr,custom-extract": 0.011595464178953758}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07533539731682147, 
"exact_match_stderr,custom-extract": 0.00848308813619093}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.097799511002445, "exact_match_stderr,custom-extract": 0.010392232214102676}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.09973753280839895, "exact_match_stderr,custom-extract": 0.015371706524248092}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07356948228882834, "exact_match_stderr,custom-extract": 0.007871525990705135}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.056254626202812734, "exact_match_stderr,custom-extract": 0.006271041217438407}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11038961038961038, "exact_match_stderr,custom-extract": 0.010314856083401148}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1282565130260521, "exact_match_stderr,custom-extract": 0.014983711363001556}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08852963818321787, "exact_match_stderr,custom-extract": 0.007884574735319066}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.11528822055137844, "exact_match_stderr,custom-extract": 0.011312646389022088}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.216, "acc_stderr,none": 0.01842190906141194, "acc_norm,none": 0.344, "acc_norm_stderr,none": 0.02126575803797874}, "piqa": {"alias": "piqa", "acc,none": 0.6528835690968444, "acc_stderr,none": 0.011107104993128085, "acc_norm,none": 0.6588683351468988, "acc_norm_stderr,none": 0.011061289443962705}, "race": {"alias": "race", "acc,none": 0.3435406698564593, "acc_stderr,none": 0.014697475413671397}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4109518935516888, "acc_stderr,none": 0.011133193398910174}, "winogrande": {"alias": "winogrande", "acc,none": 0.5761641673243884, "acc_stderr,none": 0.013888492389944506}} {"created_at": "2025-04-17T10:38:23.616091", "global_step": 114000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2909556313993174, "acc_stderr,none": 0.013273077865907585, "acc_norm,none": 0.35409556313993173, "acc_norm_stderr,none": 0.013975454122756557}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6018518518518519, "acc_stderr,none": 0.010044662374653401, "acc_norm,none": 0.5824915824915825, "acc_norm_stderr,none": 0.010119187377776024}, "boolq": {"alias": "boolq", "acc,none": 0.536697247706422, "acc_stderr,none": 0.00872146945015999}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20638820638820637, "acc_stderr,none": 0.011586881879177842}, "copa": {"alias": "copa", "acc,none": 0.66, "acc_stderr,none": 0.04760952285695238}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38040231029675364, "acc_stderr,none": 0.0048449353275992, "acc_norm,none": 0.48914558852818163, "acc_norm_stderr,none": 0.004988605498273901}, "mmlu": {"acc,none": 0.2439111237715425, "acc_stderr,none": 0.0036177932642722822, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24463336875664188, "acc_stderr,none": 0.00626510107745337, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.15873015873015872, "acc_stderr,none": 0.03268454013011744}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.033175059300091805}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23529411764705882, "acc_stderr,none": 
0.029771775228145638}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.31223628691983124, "acc_stderr,none": 0.03016513786784701}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2231404958677686, "acc_stderr,none": 0.03800754475228733}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.20245398773006135, "acc_stderr,none": 0.03157065078911902}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24022346368715083, "acc_stderr,none": 0.014288343803925305}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2379421221864952, "acc_stderr,none": 0.024185150647818707}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.27469135802469136, "acc_stderr,none": 0.024836057868294684}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2438070404172099, "acc_stderr,none": 0.010966507972178475}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23976608187134502, "acc_stderr,none": 0.03274485211946956}, "mmlu_other": {"acc,none": 0.2568393949147087, "acc_stderr,none": 0.007821919394402937, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22641509433962265, "acc_stderr,none": 0.02575755989310673}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.03242414757483099}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.3106796116504854, "acc_stderr,none": 0.04582124160161551}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.029480360549541194}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.25287356321839083, "acc_stderr,none": 0.01554337731371968}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22875816993464052, "acc_stderr,none": 0.024051029739912248}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2375886524822695, "acc_stderr,none": 0.025389512552729906}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1801470588235294, "acc_stderr,none": 0.02334516361654486}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.30120481927710846, "acc_stderr,none": 0.03571609230053481}, "mmlu_social_sciences": {"acc,none": 0.23724406889827754, "acc_stderr,none": 0.007666136103650804, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.041424397194893624}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.02962022787479049}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", 
"acc,none": 0.22279792746113988, "acc_stderr,none": 0.03003114797764154}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.23846153846153847, "acc_stderr,none": 0.02160629449464773}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.18067226890756302, "acc_stderr,none": 0.024991964966600742}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25321100917431194, "acc_stderr,none": 0.018644073041375043}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.1984732824427481, "acc_stderr,none": 0.034981493854624686}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.238562091503268, "acc_stderr,none": 0.017242385828779606}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.33636363636363636, "acc_stderr,none": 0.04525393596302505}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2163265306122449, "acc_stderr,none": 0.026358916334904055}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2537313432835821, "acc_stderr,none": 0.03076944496729602}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_stem": {"acc,none": 0.2366000634316524, "acc_stderr,none": 0.007537143884360067, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.28888888888888886, "acc_stderr,none": 0.039154506304142495}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19078947368421054, "acc_stderr,none": 0.03197565821032499}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.22916666666666666, "acc_stderr,none": 0.035146974678623884}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.16, "acc_stderr,none": 0.036845294917747094}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.09, "acc_stderr,none": 0.02876234912646613}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.03708284662416545}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.25957446808510637, "acc_stderr,none": 0.02865917937429232}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.32413793103448274, "acc_stderr,none": 0.03900432069185555}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2724867724867725, "acc_stderr,none": 0.022930973071633363}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.22580645161290322, "acc_stderr,none": 0.023785577884181012}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.26108374384236455, "acc_stderr,none": 0.030903796952114475}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.04605661864718381}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.02620276653465215}, "mmlu_high_school_physics": {"alias": " - 
high_school_physics", "acc,none": 0.23178807947019867, "acc_stderr,none": 0.034454062719870546}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.14814814814814814, "acc_stderr,none": 0.024227629273728363}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.26785714285714285, "acc_stderr,none": 0.04203277291467764}, "mmlu_pro": {"exact_match,custom-extract": 0.09466422872340426, "exact_match_stderr,custom-extract": 0.002661692449649715, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.11576011157601115, "exact_match_stderr,custom-extract": 0.011956608475421746}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11280101394169835, "exact_match_stderr,custom-extract": 0.011269480888070133}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.05123674911660778, "exact_match_stderr,custom-extract": 0.006555991897359253}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.08048780487804878, "exact_match_stderr,custom-extract": 0.013451853667809174}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1018957345971564, "exact_match_stderr,custom-extract": 0.01041903734075382}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07120743034055728, "exact_match_stderr,custom-extract": 0.008265789561265733}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.11246943765281174, "exact_match_stderr,custom-extract": 0.011053451044082592}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10236220472440945, "exact_match_stderr,custom-extract": 0.015549935163883102}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09445958219800181, "exact_match_stderr,custom-extract": 0.008818213049704723}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06439674315321983, "exact_match_stderr,custom-extract": 0.006680530175483018}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11796536796536797, "exact_match_stderr,custom-extract": 0.010617425726799206}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1062124248496994, "exact_match_stderr,custom-extract": 0.01380670822762699}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10007698229407236, "exact_match_stderr,custom-extract": 0.008329758962147563}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.13408521303258145, "exact_match_stderr,custom-extract": 0.012069766280503623}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.224, "acc_stderr,none": 0.018663994464710808, "acc_norm,none": 0.336, "acc_norm_stderr,none": 0.02114479142504886}, "piqa": {"alias": "piqa", "acc,none": 0.6659412404787813, "acc_stderr,none": 0.011004613886336738, "acc_norm,none": 0.6643090315560392, "acc_norm_stderr,none": 0.011017938116656318}, "race": {"alias": "race", "acc,none": 0.3320574162679426, "acc_stderr,none": 0.014575582129545918}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4155578300921187, "acc_stderr,none": 0.011151553840954866}, "winogrande": {"alias": "winogrande", "acc,none": 0.5737963693764798, "acc_stderr,none": 0.013898585965412338}} {"created_at": "2025-04-19T07:01:31.187539", "global_step": 124000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.29692832764505117, "acc_stderr,none": 
0.013352025976725222, "acc_norm,none": 0.3378839590443686, "acc_norm_stderr,none": 0.013822047922283512}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6254208754208754, "acc_stderr,none": 0.009931758820410617, "acc_norm,none": 0.5976430976430976, "acc_norm_stderr,none": 0.010062244711011524}, "boolq": {"alias": "boolq", "acc,none": 0.6015290519877676, "acc_stderr,none": 0.008562866533340565}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2072072072072072, "acc_stderr,none": 0.011603856781422558}, "copa": {"alias": "copa", "acc,none": 0.67, "acc_stderr,none": 0.047258156262526066}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38129854610635333, "acc_stderr,none": 0.004847129907908663, "acc_norm,none": 0.4843656642103167, "acc_norm_stderr,none": 0.004987341485856658}, "mmlu": {"acc,none": 0.2691924227318046, "acc_stderr,none": 0.003732441437830391, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24760892667375134, "acc_stderr,none": 0.006294904029763624, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.040061680838488774}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.296969696969697, "acc_stderr,none": 0.035679697722680474}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.03077855467869326}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.23628691983122363, "acc_stderr,none": 0.02765215314415927}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.23140495867768596, "acc_stderr,none": 0.03849856098794088}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.039578354719809784}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.0335195387952127}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23410404624277456, "acc_stderr,none": 0.022797110278071138}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2508038585209003, "acc_stderr,none": 0.024619771956697165}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21604938271604937, "acc_stderr,none": 0.022899162918445796}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25945241199478486, "acc_stderr,none": 0.011195262076350309}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.19883040935672514, "acc_stderr,none": 0.03061111655743253}, "mmlu_other": {"acc,none": 0.2735757965883489, "acc_stderr,none": 0.007973331930451964, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542129}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2943396226415094, "acc_stderr,none": 0.028049186315695245}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.31213872832369943, "acc_stderr,none": 0.035331333893236574}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.038612291966536955}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2556053811659193, "acc_stderr,none": 0.029275891003969927}, "mmlu_management": {"alias": " - management", "acc,none": 0.2912621359223301, 
"acc_stderr,none": 0.044986763205729224}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2692307692307692, "acc_stderr,none": 0.029058588303748842}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24010217113665389, "acc_stderr,none": 0.015274685213734195}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.026090162504279035}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24822695035460993, "acc_stderr,none": 0.025770015644290403}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3713235294117647, "acc_stderr,none": 0.02934980313976587}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.22289156626506024, "acc_stderr,none": 0.03240004825594688}, "mmlu_social_sciences": {"acc,none": 0.28631784205394867, "acc_stderr,none": 0.008126158884530353, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.04227054451232199}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.26262626262626265, "acc_stderr,none": 0.031353050095330855}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.3316062176165803, "acc_stderr,none": 0.03397636541089116}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.28717948717948716, "acc_stderr,none": 0.022939925418530616}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2773109243697479, "acc_stderr,none": 0.02907937453948001}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.27889908256880735, "acc_stderr,none": 0.019227468876463517}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.26143790849673204, "acc_stderr,none": 0.017776947157528037}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3, "acc_stderr,none": 0.04389311454644286}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.4163265306122449, "acc_stderr,none": 0.031557828165561644}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.27860696517412936, "acc_stderr,none": 0.031700561834973086}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_stem": {"acc,none": 0.28036790358388836, "acc_stderr,none": 0.007966117278784948, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.31851851851851853, "acc_stderr,none": 0.04024778401977111}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3223684210526316, "acc_stderr,none": 0.03803510248351585}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.03745554791462457}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.38, "acc_stderr,none": 0.048783173121456316}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, 
"mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.04655010411319616}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653697}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2170212765957447, "acc_stderr,none": 0.02694748312149623}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2206896551724138, "acc_stderr,none": 0.034559302019248124}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.022418042891113935}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2806451612903226, "acc_stderr,none": 0.0255606047210229}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.03194740072265541}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.026202766534652148}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.3443708609271523, "acc_stderr,none": 0.038796870240733264}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.39814814814814814, "acc_stderr,none": 0.033384734032074016}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.20535714285714285, "acc_stderr,none": 0.03834241021419073}, "mmlu_pro": {"exact_match,custom-extract": 0.08768284574468085, "exact_match_stderr,custom-extract": 0.0025708400018798778, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.10181311018131102, "exact_match_stderr,custom-extract": 0.011301307630409364}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09759188846641319, "exact_match_stderr,custom-extract": 0.010571710152486615}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.05565371024734982, "exact_match_stderr,custom-extract": 0.006816813274624111}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.0951219512195122, "exact_match_stderr,custom-extract": 0.0145068709473778}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.09597156398104266, "exact_match_stderr,custom-extract": 0.010144918080327492}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08668730650154799, "exact_match_stderr,custom-extract": 0.00904377653422992}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.10880195599022005, "exact_match_stderr,custom-extract": 0.010894177212595786}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10761154855643044, "exact_match_stderr,custom-extract": 0.01589697945272338}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08174386920980926, "exact_match_stderr,custom-extract": 0.008260630014297237}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.04145077720207254, "exact_match_stderr,custom-extract": 0.005425086466040211}, "mmlu_pro_other": {"alias": " - other", 
"exact_match,custom-extract": 0.10822510822510822, "exact_match_stderr,custom-extract": 0.01022564671191463}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13827655310621242, "exact_match_stderr,custom-extract": 0.015468334539576873}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08468052347959969, "exact_match_stderr,custom-extract": 0.007727531295642343}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.10401002506265664, "exact_match_stderr,custom-extract": 0.010813343895839506}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.234, "acc_stderr,none": 0.01895274156489368, "acc_norm,none": 0.354, "acc_norm_stderr,none": 0.021407582047916447}, "piqa": {"alias": "piqa", "acc,none": 0.6686615886833515, "acc_stderr,none": 0.010982077458957346, "acc_norm,none": 0.6675734494015234, "acc_norm_stderr,none": 0.010991141557445596}, "race": {"alias": "race", "acc,none": 0.3368421052631579, "acc_stderr,none": 0.01462754386904513}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.406345957011259, "acc_stderr,none": 0.011113825275480054}, "winogrande": {"alias": "winogrande", "acc,none": 0.5974743488555643, "acc_stderr,none": 0.013782866831703044}} {"created_at": "2025-04-19T07:08:22.842964", "global_step": 122000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2986348122866894, "acc_stderr,none": 0.013374078615068745, "acc_norm,none": 0.3319112627986348, "acc_norm_stderr,none": 0.013760988200880538}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6052188552188552, "acc_stderr,none": 0.01003003893588358, "acc_norm,none": 0.5879629629629629, "acc_norm_stderr,none": 0.010099765857562764}, "boolq": {"alias": "boolq", "acc,none": 0.5024464831804282, "acc_stderr,none": 0.00874495029256737}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.18263718263718265, "acc_stderr,none": 0.0110617062892271}, "copa": {"alias": "copa", "acc,none": 0.65, "acc_stderr,none": 0.0479372485441102}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3819956184027086, "acc_stderr,none": 0.004848824710995941, "acc_norm,none": 0.4902409878510257, "acc_norm_stderr,none": 0.004988830884131624}, "mmlu": {"acc,none": 0.24740065517732518, "acc_stderr,none": 0.0036373544731932816, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25823591923485656, "acc_stderr,none": 0.00638142290187731, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.040061680838488774}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.03453131801885415}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.030778554678693257}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.31223628691983124, "acc_stderr,none": 0.030165137867847008}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2231404958677686, "acc_stderr,none": 0.03800754475228733}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25153374233128833, "acc_stderr,none": 0.03408997886857529}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2514450867052023, "acc_stderr,none": 0.02335736578587403}, "mmlu_moral_scenarios": {"alias": " 
- moral_scenarios", "acc,none": 0.26256983240223464, "acc_stderr,none": 0.014716824273017752}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.20257234726688103, "acc_stderr,none": 0.02282731749105969}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24382716049382716, "acc_stderr,none": 0.02389187954195961}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26010430247718386, "acc_stderr,none": 0.01120438288782383}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.03508771929824565}, "mmlu_other": {"acc,none": 0.25458641776633406, "acc_stderr,none": 0.007799241250595875, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.04605661864718381}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2490566037735849, "acc_stderr,none": 0.026616482980501704}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.031265112061730424}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3273542600896861, "acc_stderr,none": 0.03149384670994131}, "mmlu_management": {"alias": " - management", "acc,none": 0.20388349514563106, "acc_stderr,none": 0.03989139859531769}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3162393162393162, "acc_stderr,none": 0.03046365674734026}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2388250319284802, "acc_stderr,none": 0.015246803197398679}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.238562091503268, "acc_stderr,none": 0.02440439492808787}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.28368794326241137, "acc_stderr,none": 0.026891709428343957}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.18382352941176472, "acc_stderr,none": 0.023529242185193106}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.26506024096385544, "acc_stderr,none": 0.03436024037944967}, "mmlu_social_sciences": {"acc,none": 0.23691907702307444, "acc_stderr,none": 0.00766250849750746, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.042270544512321984}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.21717171717171718, "acc_stderr,none": 0.029376616484945633}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20207253886010362, "acc_stderr,none": 0.02897908979429673}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2512820512820513, "acc_stderr,none": 0.021992016662370547}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.026265024608275882}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.21467889908256882, "acc_stderr,none": 0.017604304149256487}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2900763358778626, "acc_stderr,none": 0.03980066246467765}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 
0.25, "acc_stderr,none": 0.01751781884501444}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04265792110940589}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2163265306122449, "acc_stderr,none": 0.026358916334904038}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22388059701492538, "acc_stderr,none": 0.02947525023601718}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_stem": {"acc,none": 0.23437995559784333, "acc_stderr,none": 0.007532608593753215, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.038201699145179055}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19078947368421054, "acc_stderr,none": 0.031975658210325004}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.19444444444444445, "acc_stderr,none": 0.03309615177059006}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.17647058823529413, "acc_stderr,none": 0.037932811853078084}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3148936170212766, "acc_stderr,none": 0.030363582197238167}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.036001056927277716}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776568}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2161290322580645, "acc_stderr,none": 0.023415293433568535}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.20689655172413793, "acc_stderr,none": 0.028501378167893946}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.02620276653465215}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.24503311258278146, "acc_stderr,none": 0.035118075718047245}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1712962962962963, "acc_stderr,none": 0.025695341643824688}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.20535714285714285, "acc_stderr,none": 0.038342410214190714}, "mmlu_pro": {"exact_match,custom-extract": 0.08136635638297872, "exact_match_stderr,custom-extract": 0.0024824379578640057, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.09902370990237098, "exact_match_stderr,custom-extract": 0.011162713195868411}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 
0.07477820025348543, "exact_match_stderr,custom-extract": 0.009370166843528007}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.03975265017667844, "exact_match_stderr,custom-extract": 0.005809560779109003}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.06341463414634146, "exact_match_stderr,custom-extract": 0.012050547403328608}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.0995260663507109, "exact_match_stderr,custom-extract": 0.010310748774630962}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.06191950464396285, "exact_match_stderr,custom-extract": 0.007746332082478447}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.0941320293398533, "exact_match_stderr,custom-extract": 0.01021621760101336}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11548556430446194, "exact_match_stderr,custom-extract": 0.016395494305781095}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07538601271571299, "exact_match_stderr,custom-extract": 0.007960297036631295}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.056254626202812734, "exact_match_stderr,custom-extract": 0.006271041217438408}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10173160173160173, "exact_match_stderr,custom-extract": 0.00995016199178466}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13226452905811623, "exact_match_stderr,custom-extract": 0.015181011139551248}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.06928406466512702, "exact_match_stderr,custom-extract": 0.007048357240585414}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.13032581453634084, "exact_match_stderr,custom-extract": 0.011925163793228668}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.212, "acc_stderr,none": 0.01829703700401389, "acc_norm,none": 0.332, "acc_norm_stderr,none": 0.02108176657122286}, "piqa": {"alias": "piqa", "acc,none": 0.6675734494015234, "acc_stderr,none": 0.010991141557445596, "acc_norm,none": 0.6632208922742111, "acc_norm_stderr,none": 0.011026738925251182}, "race": {"alias": "race", "acc,none": 0.3464114832535885, "acc_stderr,none": 0.014726451021782801}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4083930399181167, "acc_stderr,none": 0.011122558066098066}, "winogrande": {"alias": "winogrande", "acc,none": 0.5706393054459353, "acc_stderr,none": 0.013911537499969172}} {"created_at": "2025-04-19T07:17:20.021654", "global_step": 126000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.30887372013651876, "acc_stderr,none": 0.013501770929344003, "acc_norm,none": 0.3293515358361775, "acc_norm_stderr,none": 0.013734057652635474}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6220538720538721, "acc_stderr,none": 0.009949405744045469, "acc_norm,none": 0.6077441077441077, "acc_norm_stderr,none": 0.010018744689650043}, "boolq": {"alias": "boolq", "acc,none": 0.454434250764526, "acc_stderr,none": 0.008708665643758018}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20147420147420148, "acc_stderr,none": 0.011483500195202903}, "copa": {"alias": "copa", "acc,none": 0.7, "acc_stderr,none": 0.046056618647183814}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38398725353515234, "acc_stderr,none": 0.004853608805843877, "acc_norm,none": 0.4902409878510257, 
"acc_norm_stderr,none": 0.004988830884131619}, "mmlu": {"acc,none": 0.24711579547073068, "acc_stderr,none": 0.0036398437016752078, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25823591923485656, "acc_stderr,none": 0.006383948588445234, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.039325376803928676}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.031922715695482974}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.031321798030832904}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25316455696202533, "acc_stderr,none": 0.028304657943035313}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.03984979653302872}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.042365112580946336}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.19631901840490798, "acc_stderr,none": 0.031207970394709215}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2630057803468208, "acc_stderr,none": 0.023703099525258165}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808855}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2604501607717042, "acc_stderr,none": 0.02492672322484555}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25308641975308643, "acc_stderr,none": 0.024191808600713}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2737940026075619, "acc_stderr,none": 0.011388612167979388}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.03377310252209194}, "mmlu_other": {"acc,none": 0.24396523978113938, "acc_stderr,none": 0.007708118222713632, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2528301886792453, "acc_stderr,none": 0.026749899771241235}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.19653179190751446, "acc_stderr,none": 0.030299574664788147}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2645739910313901, "acc_stderr,none": 0.029605103217038315}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.04185832598928315}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.26495726495726496, "acc_stderr,none": 0.028911208802749482}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421296}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23116219667943805, "acc_stderr,none": 0.015075523238101088}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.025261691219729487}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2695035460992908, "acc_stderr,none": 0.026469036818590634}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 
0.20955882352941177, "acc_stderr,none": 0.024723110407677065}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21084337349397592, "acc_stderr,none": 0.03175554786629919}, "mmlu_social_sciences": {"acc,none": 0.24049398765030874, "acc_stderr,none": 0.0077023151757698305, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.040493392977481425}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2676767676767677, "acc_stderr,none": 0.03154449888270285}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.23316062176165803, "acc_stderr,none": 0.030516111371476008}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2076923076923077, "acc_stderr,none": 0.020567539567246794}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.027553614467863797}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22752293577981653, "acc_stderr,none": 0.0179744635787765}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.037683359597287414}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25326797385620914, "acc_stderr,none": 0.017593486895366835}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.041723430387053825}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.30612244897959184, "acc_stderr,none": 0.02950489645459597}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.18407960199004975, "acc_stderr,none": 0.027403859410786845}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_stem": {"acc,none": 0.24008880431335236, "acc_stderr,none": 0.0076038878002088394, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.03820169914517905}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.17105263157894737, "acc_stderr,none": 0.03064360707167709}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.22916666666666666, "acc_stderr,none": 0.03514697467862388}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.038739587141493524}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.24680851063829787, "acc_stderr,none": 0.028185441301234092}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.25517241379310346, "acc_stderr,none": 0.03632984052707842}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 
0.2671957671957672, "acc_stderr,none": 0.022789673145776568}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25161290322580643, "acc_stderr,none": 0.024685979286239956}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1921182266009852, "acc_stderr,none": 0.02771931570961477}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23333333333333334, "acc_stderr,none": 0.02578787422095933}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.0347918557259966}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.19444444444444445, "acc_stderr,none": 0.026991454502036733}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2767857142857143, "acc_stderr,none": 0.04246624336697623}, "mmlu_pro": {"exact_match,custom-extract": 0.08851396276595745, "exact_match_stderr,custom-extract": 0.0025818131045224073, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.07531380753138076, "exact_match_stderr,custom-extract": 0.009862294734961797}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11406844106463879, "exact_match_stderr,custom-extract": 0.011324518110214692}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.05565371024734982, "exact_match_stderr,custom-extract": 0.006816813274624114}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.0951219512195122, "exact_match_stderr,custom-extract": 0.014506870947377805}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1066350710900474, "exact_match_stderr,custom-extract": 0.010630426613851838}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07946336429308566, "exact_match_stderr,custom-extract": 0.00869293303436701}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.11613691931540342, "exact_match_stderr,custom-extract": 0.011208993552473897}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12073490813648294, "exact_match_stderr,custom-extract": 0.016714159620683282}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07084468664850137, "exact_match_stderr,custom-extract": 0.007735732733830227}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.052553663952627686, "exact_match_stderr,custom-extract": 0.006073120886501437}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12229437229437229, "exact_match_stderr,custom-extract": 0.01078392421390481}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.10020040080160321, "exact_match_stderr,custom-extract": 0.013455286690416086}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08545034642032333, "exact_match_stderr,custom-extract": 0.0077593119520955435}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.11027568922305764, "exact_match_stderr,custom-extract": 0.01109528490117259}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.206, "acc_stderr,none": 0.01810479403733354, "acc_norm,none": 0.346, "acc_norm_stderr,none": 0.021294951277234637}, 
"piqa": {"alias": "piqa", "acc,none": 0.6463547334058759, "acc_stderr,none": 0.011154877708188675, "acc_norm,none": 0.6577801958650707, "acc_norm_stderr,none": 0.011069764658685451}, "race": {"alias": "race", "acc,none": 0.34258373205741627, "acc_stderr,none": 0.014687684737145162}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4078812691914023, "acc_stderr,none": 0.011120393600595453}, "winogrande": {"alias": "winogrande", "acc,none": 0.5887924230465666, "acc_stderr,none": 0.013829128358676876}} {"created_at": "2025-04-19T13:23:42.151293", "global_step": 128000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3097269624573379, "acc_stderr,none": 0.013512058415238363, "acc_norm,none": 0.34812286689419797, "acc_norm_stderr,none": 0.01392100859517934}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6199494949494949, "acc_stderr,none": 0.00996017583149313, "acc_norm,none": 0.6153198653198653, "acc_norm_stderr,none": 0.00998317170700901}, "boolq": {"alias": "boolq", "acc,none": 0.5492354740061162, "acc_stderr,none": 0.008702553362422875}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19737919737919737, "acc_stderr,none": 0.011395305685091194}, "copa": {"alias": "copa", "acc,none": 0.67, "acc_stderr,none": 0.047258156262526066}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3881696873132842, "acc_stderr,none": 0.004863375698153869, "acc_norm,none": 0.4922326229834694, "acc_norm_stderr,none": 0.004989179286677388}, "mmlu": {"acc,none": 0.2684090585386697, "acc_stderr,none": 0.003738058933167412, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2599362380446334, "acc_stderr,none": 0.006390229752363171, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.373015873015873, "acc_stderr,none": 0.043255060420170854}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3090909090909091, "acc_stderr,none": 0.036085410115739666}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23039215686274508, "acc_stderr,none": 0.029554292605695066}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2489451476793249, "acc_stderr,none": 0.028146970599422644}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.039849796533028725}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.032910995786157686}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.023445826276545543}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.25251396648044694, "acc_stderr,none": 0.014530330201468655}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.24758842443729903, "acc_stderr,none": 0.024513879973621967}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24691358024691357, "acc_stderr,none": 0.023993501709042114}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2737940026075619, "acc_stderr,none": 0.01138861216797938}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.19298245614035087, "acc_stderr,none": 0.03026745755489847}, "mmlu_other": {"acc,none": 0.2719665271966527, "acc_stderr,none": 0.007989824021824367, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - 
business_ethics", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2830188679245283, "acc_stderr,none": 0.0277242364927009}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0326926380614177}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.25112107623318386, "acc_stderr,none": 0.029105220833224615}, "mmlu_management": {"alias": " - management", "acc,none": 0.3106796116504854, "acc_stderr,none": 0.0458212416016155}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.24786324786324787, "acc_stderr,none": 0.0282863240755644}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2835249042145594, "acc_stderr,none": 0.01611731816683228}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.024954184324879915}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2907801418439716, "acc_stderr,none": 0.027090664368353178}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.026799562024887674}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370518}, "mmlu_social_sciences": {"acc,none": 0.28761780955476113, "acc_stderr,none": 0.008160560612283672, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.044346007015849245}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.35858585858585856, "acc_stderr,none": 0.034169036403915214}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2694300518134715, "acc_stderr,none": 0.03201867122877793}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.30512820512820515, "acc_stderr,none": 0.023346335293325887}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.28991596638655465, "acc_stderr,none": 0.02947248583313608}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.28990825688073396, "acc_stderr,none": 0.019453066609201597}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.018054027458815194}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2818181818181818, "acc_stderr,none": 0.043091187099464585}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2979591836734694, "acc_stderr,none": 0.02927956741106567}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.25870646766169153, "acc_stderr,none": 0.030965903123573026}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_stem": {"acc,none": 0.2588011417697431, "acc_stderr,none": 0.007797479405704096, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, 
"acc_stderr,none": 0.04408440022768077}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.03820169914517905}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3026315789473684, "acc_stderr,none": 0.037385206761196686}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.30392156862745096, "acc_stderr,none": 0.04576665403207761}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342343}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.03806142687309993}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.18253968253968253, "acc_stderr,none": 0.01989487936717555}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2967741935483871, "acc_stderr,none": 0.025988500792411884}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.031447125816782426}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24814814814814815, "acc_stderr,none": 0.0263357394040558}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2913907284768212, "acc_stderr,none": 0.03710185726119994}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.25462962962962965, "acc_stderr,none": 0.029711275860005344}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755807}, "mmlu_pro": {"exact_match,custom-extract": 0.09333444148936171, "exact_match_stderr,custom-extract": 0.0026435557995661426, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.07531380753138076, "exact_match_stderr,custom-extract": 0.009862294734961754}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10012674271229405, "exact_match_stderr,custom-extract": 0.010693074879962142}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.04240282685512368, "exact_match_stderr,custom-extract": 0.00599180306057588}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.09268292682926829, "exact_match_stderr,custom-extract": 0.014338963443185472}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.10426540284360189, "exact_match_stderr,custom-extract": 0.010525579113489895}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07430340557275542, "exact_match_stderr,custom-extract": 0.008429484453629038}, "mmlu_pro_health": 
{"alias": " - health", "exact_match,custom-extract": 0.13080684596577016, "exact_match_stderr,custom-extract": 0.011796749495986723}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12598425196850394, "exact_match_stderr,custom-extract": 0.017022602638569514}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10990009082652134, "exact_match_stderr,custom-extract": 0.009430225142537548}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07772020725388601, "exact_match_stderr,custom-extract": 0.007286709191616169}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11147186147186147, "exact_match_stderr,custom-extract": 0.010358988935082428}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11222444889779559, "exact_match_stderr,custom-extract": 0.014144273960803885}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07775211701308699, "exact_match_stderr,custom-extract": 0.007432631448995878}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12907268170426064, "exact_match_stderr,custom-extract": 0.011876239922954071}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.22, "acc_stderr,none": 0.01854421137582033, "acc_norm,none": 0.33, "acc_norm_stderr,none": 0.02104961216613481}, "piqa": {"alias": "piqa", "acc,none": 0.6692056583242655, "acc_stderr,none": 0.010977520584714441, "acc_norm,none": 0.6751904243743199, "acc_norm_stderr,none": 0.010926296238294034}, "race": {"alias": "race", "acc,none": 0.3320574162679426, "acc_stderr,none": 0.014575582129545916}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4186284544524053, "acc_stderr,none": 0.011163236779233962}, "winogrande": {"alias": "winogrande", "acc,none": 0.5887924230465666, "acc_stderr,none": 0.01382912835867687}} {"created_at": "2025-04-19T16:51:32.536197", "global_step": 136000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.28754266211604096, "acc_stderr,none": 0.013226719056266132, "acc_norm,none": 0.3430034129692833, "acc_norm_stderr,none": 0.013872423223718166}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6279461279461279, "acc_stderr,none": 0.009918187193096482, "acc_norm,none": 0.6127946127946128, "acc_norm_stderr,none": 0.009995312065890353}, "boolq": {"alias": "boolq", "acc,none": 0.39327217125382263, "acc_stderr,none": 0.008543505537417864}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19656019656019655, "acc_stderr,none": 0.011377439773964002}, "copa": {"alias": "copa", "acc,none": 0.69, "acc_stderr,none": 0.04648231987117316}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38717386974706236, "acc_stderr,none": 0.004861084534087033, "acc_norm,none": 0.4993029277036447, "acc_norm_stderr,none": 0.00498977656227611}, "mmlu": {"acc,none": 0.2597208374875374, "acc_stderr,none": 0.0036918516348289424, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2561105207226355, "acc_stderr,none": 0.00636811816595445, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.03852273364924315}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.03546563019624335}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.030587591351604246}, "mmlu_high_school_world_history": {"alias": " - 
high_school_world_history", "acc,none": 0.2489451476793249, "acc_stderr,none": 0.028146970599422644}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25153374233128833, "acc_stderr,none": 0.034089978868575295}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2335195530726257, "acc_stderr,none": 0.014149575348976267}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2733118971061093, "acc_stderr,none": 0.025311765975426122}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25308641975308643, "acc_stderr,none": 0.024191808600713}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26727509778357234, "acc_stderr,none": 0.011302607515637528}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.03301405946987251}, "mmlu_other": {"acc,none": 0.24750563244287094, "acc_stderr,none": 0.007715727351329334, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2490566037735849, "acc_stderr,none": 0.026616482980501715}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.032424147574830996}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.19282511210762332, "acc_stderr,none": 0.026478240960489365}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.04058042015646034}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.24786324786324787, "acc_stderr,none": 0.028286324075564383}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.22094508301404853, "acc_stderr,none": 0.014836205167333574}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.024630048979824758}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.02601199293090201}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.38235294117647056, "acc_stderr,none": 0.02952009569768776}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25301204819277107, "acc_stderr,none": 0.03384429155233134}, "mmlu_social_sciences": {"acc,none": 0.2674683132921677, "acc_stderr,none": 0.007942903200493023, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.040969851398436695}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3434343434343434, "acc_stderr,none": 0.033832012232444426}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.3160621761658031, "acc_stderr,none": 0.033553973696861736}, 
"mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.23333333333333334, "acc_stderr,none": 0.021444547301560476}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.20168067226890757, "acc_stderr,none": 0.026064313406304527}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26422018348623855, "acc_stderr,none": 0.018904164171510193}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2824427480916031, "acc_stderr,none": 0.03948406125768362}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.26143790849673204, "acc_stderr,none": 0.017776947157528037}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.17272727272727273, "acc_stderr,none": 0.03620691833929219}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.37142857142857144, "acc_stderr,none": 0.03093285879278986}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.208955223880597, "acc_stderr,none": 0.028748298931728655}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_stem": {"acc,none": 0.2695845226768157, "acc_stderr,none": 0.007882973172661897, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.03785714465066655}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3355263157894737, "acc_stderr,none": 0.038424985593952694}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.22916666666666666, "acc_stderr,none": 0.03514697467862388}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909281}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.04389869956808777}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.20851063829787234, "acc_stderr,none": 0.026556982117838718}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.02306818884826111}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2709677419354839, "acc_stderr,none": 0.025284416114900152}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.031947400722655395}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.027309140588230175}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2980132450331126, "acc_stderr,none": 
0.037345356767871984}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.38425925925925924, "acc_stderr,none": 0.03317354514310742}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.16071428571428573, "acc_stderr,none": 0.0348594609647574}, "mmlu_pro": {"exact_match,custom-extract": 0.10081449468085106, "exact_match_stderr,custom-extract": 0.0027309459729920847, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.13528591352859135, "exact_match_stderr,custom-extract": 0.012782212846937785}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10139416983523447, "exact_match_stderr,custom-extract": 0.010752959229023343}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.044169611307420496, "exact_match_stderr,custom-extract": 0.006109714311601104}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.11707317073170732, "exact_match_stderr,custom-extract": 0.01589752048346171}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.13033175355450238, "exact_match_stderr,custom-extract": 0.011595464178953758}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08152734778121776, "exact_match_stderr,custom-extract": 0.008795227818605757}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.13202933985330073, "exact_match_stderr,custom-extract": 0.011843408801240358}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.13385826771653545, "exact_match_stderr,custom-extract": 0.017467280079326592}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08719346049046321, "exact_match_stderr,custom-extract": 0.008506188171943508}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06365655070318282, "exact_match_stderr,custom-extract": 0.006644652222081492}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.13636363636363635, "exact_match_stderr,custom-extract": 0.011295719428226613}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.10821643286573146, "exact_match_stderr,custom-extract": 0.013920719044718397}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08468052347959969, "exact_match_stderr,custom-extract": 0.007727531295642343}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.14786967418546365, "exact_match_stderr,custom-extract": 0.012573709084942281}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.236, "acc_stderr,none": 0.019008699622084728, "acc_norm,none": 0.364, "acc_norm_stderr,none": 0.021539170637317695}, "piqa": {"alias": "piqa", "acc,none": 0.6626768226332971, "acc_stderr,none": 0.011031114785059696, "acc_norm,none": 0.6713819368879217, "acc_norm_stderr,none": 0.010959127105167044}, "race": {"alias": "race", "acc,none": 0.3492822966507177, "acc_stderr,none": 0.014754834713104495}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.417093142272262, "acc_stderr,none": 0.011157450926787525}, "winogrande": {"alias": "winogrande", "acc,none": 0.590370955011839, "acc_stderr,none": 0.013821049109655472}} {"created_at": "2025-04-19T17:09:11.984164", "global_step": 134000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.28668941979522183, "acc_stderr,none": 0.013214986329274767, "acc_norm,none": 0.3310580204778157, 
"acc_norm_stderr,none": 0.01375206241981783}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6052188552188552, "acc_stderr,none": 0.010030038935883587, "acc_norm,none": 0.5778619528619529, "acc_norm_stderr,none": 0.010134620524592268}, "boolq": {"alias": "boolq", "acc,none": 0.5785932721712538, "acc_stderr,none": 0.008636344580414682}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2072072072072072, "acc_stderr,none": 0.011603856781422558}, "copa": {"alias": "copa", "acc,none": 0.73, "acc_stderr,none": 0.044619604333847394}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3831905994821749, "acc_stderr,none": 0.004851705504790437, "acc_norm,none": 0.4954192391953794, "acc_norm_stderr,none": 0.004989572002196689}, "mmlu": {"acc,none": 0.25907990314769974, "acc_stderr,none": 0.00367793343516203, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2414452709883103, "acc_stderr,none": 0.006226337443503091, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3492063492063492, "acc_stderr,none": 0.042639068927951315}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3151515151515151, "acc_stderr,none": 0.0362773057502241}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.029331162294251745}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.22362869198312235, "acc_stderr,none": 0.02712329820522997}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.15702479338842976, "acc_stderr,none": 0.0332124484254713}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.038935425188248475}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.033519538795212696}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.1994219653179191, "acc_stderr,none": 0.021511900654252538}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.27262569832402234, "acc_stderr,none": 0.01489339173524962}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2347266881028939, "acc_stderr,none": 0.024071805887677045}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2345679012345679, "acc_stderr,none": 0.023576881744005716}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24119947848761408, "acc_stderr,none": 0.01092649610203495}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.1695906432748538, "acc_stderr,none": 0.028782108105401712}, "mmlu_other": {"acc,none": 0.25040231734792406, "acc_stderr,none": 0.007713528490643746, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3283018867924528, "acc_stderr,none": 0.02890159361241178}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.34104046242774566, "acc_stderr,none": 0.036146654241808254}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653697}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.14798206278026907, "acc_stderr,none": 0.023831557157613526}, "mmlu_management": {"alias": " - management", "acc,none": 0.4174757281553398, "acc_stderr,none": 0.048828405482122375}, "mmlu_marketing": 
{"alias": " - marketing", "acc,none": 0.2094017094017094, "acc_stderr,none": 0.026655699653922747}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.21839080459770116, "acc_stderr,none": 0.014774358319934504}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.02609016250427904}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.026011992930902002}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2757352941176471, "acc_stderr,none": 0.02714627193662517}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21084337349397592, "acc_stderr,none": 0.03175554786629921}, "mmlu_social_sciences": {"acc,none": 0.2804679883002925, "acc_stderr,none": 0.008056393628141, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.20175438596491227, "acc_stderr,none": 0.03775205013583639}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.31313131313131315, "acc_stderr,none": 0.033042050878136525}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.33678756476683935, "acc_stderr,none": 0.03410780251836184}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.31025641025641026, "acc_stderr,none": 0.02345467488940429}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.27310924369747897, "acc_stderr,none": 0.02894200404099817}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3229357798165138, "acc_stderr,none": 0.020048115923415342}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.0364129708131373}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2107843137254902, "acc_stderr,none": 0.016500472979024773}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3673469387755102, "acc_stderr,none": 0.030862144921087558}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.27860696517412936, "acc_stderr,none": 0.031700561834973086}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_stem": {"acc,none": 0.2730732635585157, "acc_stderr,none": 0.007898864174696046, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.035914440841969694}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3355263157894737, "acc_stderr,none": 0.038424985593952694}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.037455547914624576}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_college_mathematics": {"alias": " - 
college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04690650298201942}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.23404255319148937, "acc_stderr,none": 0.027678452578212387}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131184}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2724867724867725, "acc_stderr,none": 0.022930973071633345}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3032258064516129, "acc_stderr,none": 0.026148685930671746}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2660098522167488, "acc_stderr,none": 0.031089826002937523}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.14, "acc_stderr,none": 0.03487350880197769}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.027309140588230172}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2847682119205298, "acc_stderr,none": 0.03684881521389023}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.4027777777777778, "acc_stderr,none": 0.03344887382997866}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.20535714285714285, "acc_stderr,none": 0.03834241021419073}, "mmlu_pro": {"exact_match,custom-extract": 0.09449800531914894, "exact_match_stderr,custom-extract": 0.00266128887073379, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.08926080892608089, "exact_match_stderr,custom-extract": 0.010655428293467986}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09759188846641319, "exact_match_stderr,custom-extract": 0.010571710152486615}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.05653710247349823, "exact_match_stderr,custom-extract": 0.006867487601519862}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1024390243902439, "exact_match_stderr,custom-extract": 0.014993500684238468}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.12677725118483413, "exact_match_stderr,custom-extract": 0.011459598419118571}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.0825593395252838, "exact_match_stderr,custom-extract": 0.008845745053999928}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.10146699266503667, "exact_match_stderr,custom-extract": 0.01056375654506443}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12335958005249344, "exact_match_stderr,custom-extract": 0.016869623436798514}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.06902815622161672, "exact_match_stderr,custom-extract": 0.007643373236177104}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.08216136195410807, "exact_match_stderr,custom-extract": 0.007473948460954437}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11255411255411256, 
"exact_match_stderr,custom-extract": 0.010402812578120452}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1402805611222445, "exact_match_stderr,custom-extract": 0.015561893867712503}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09930715935334873, "exact_match_stderr,custom-extract": 0.008301207861994725}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.10401002506265664, "exact_match_stderr,custom-extract": 0.0108133438958395}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.23, "acc_stderr,none": 0.018839050391123137, "acc_norm,none": 0.34, "acc_norm_stderr,none": 0.021206117013673066}, "piqa": {"alias": "piqa", "acc,none": 0.6637649619151251, "acc_stderr,none": 0.01102234670897024, "acc_norm,none": 0.6583242655059848, "acc_norm_stderr,none": 0.01106553514384152}, "race": {"alias": "race", "acc,none": 0.3253588516746411, "acc_stderr,none": 0.014499982471636879}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4002047082906858, "acc_stderr,none": 0.01108642443878967}, "winogrande": {"alias": "winogrande", "acc,none": 0.5935280189423836, "acc_stderr,none": 0.01380444869775338}} {"created_at": "2025-04-19T17:11:36.937875", "global_step": 132000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3003412969283277, "acc_stderr,none": 0.013395909309956999, "acc_norm,none": 0.3395904436860068, "acc_norm_stderr,none": 0.013839039762820167}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6220538720538721, "acc_stderr,none": 0.009949405744045467, "acc_norm,none": 0.6026936026936027, "acc_norm_stderr,none": 0.010041053078884277}, "boolq": {"alias": "boolq", "acc,none": 0.40672782874617736, "acc_stderr,none": 0.00859154902290091}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19656019656019655, "acc_stderr,none": 0.011377439773963998}, "copa": {"alias": "copa", "acc,none": 0.7, "acc_stderr,none": 0.046056618647183814}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38408683529177456, "acc_stderr,none": 0.00485384575039215, "acc_norm,none": 0.4896434973112926, "acc_norm_stderr,none": 0.004988710917169328}, "mmlu": {"acc,none": 0.26427859279304944, "acc_stderr,none": 0.003716479863066855, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2592986184909671, "acc_stderr,none": 0.006388896617847207, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.03893259610604674}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.0340150671524904}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.0313217980308329}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.27848101265822783, "acc_stderr,none": 0.02917868230484256}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.34710743801652894, "acc_stderr,none": 0.04345724570292535}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.044531975073749834}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615771}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.02361867831006937}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 
0.2681564245810056, "acc_stderr,none": 0.014816119635317008}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.21221864951768488, "acc_stderr,none": 0.023222756797435108}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.22839506172839505, "acc_stderr,none": 0.023358211840626267}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.258148631029987, "acc_stderr,none": 0.0111769237193134}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.26900584795321636, "acc_stderr,none": 0.0340105262010409}, "mmlu_other": {"acc,none": 0.26778242677824265, "acc_stderr,none": 0.007912673829129042, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24528301886792453, "acc_stderr,none": 0.0264803571798957}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23121387283236994, "acc_stderr,none": 0.032147373020294696}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.27802690582959644, "acc_stderr,none": 0.030069584874494026}, "mmlu_management": {"alias": " - management", "acc,none": 0.20388349514563106, "acc_stderr,none": 0.039891398595317706}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2692307692307692, "acc_stderr,none": 0.029058588303748842}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24521072796934865, "acc_stderr,none": 0.01538435228454394}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.30392156862745096, "acc_stderr,none": 0.026336613469046633}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2375886524822695, "acc_stderr,none": 0.025389512552729896}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.39705882352941174, "acc_stderr,none": 0.029722152099280055}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.29518072289156627, "acc_stderr,none": 0.035509201856896294}, "mmlu_social_sciences": {"acc,none": 0.2677933051673708, "acc_stderr,none": 0.00797506775521081, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.3684210526315789, "acc_stderr,none": 0.04537815354939391}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.31313131313131315, "acc_stderr,none": 0.033042050878136525}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.26424870466321243, "acc_stderr,none": 0.03182155050916647}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2846153846153846, "acc_stderr,none": 0.022878322799706297}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.25210084033613445, "acc_stderr,none": 0.028205545033277733}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.27339449541284405, "acc_stderr,none": 0.019109299846098295}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.32061068702290074, "acc_stderr,none": 0.04093329229834278}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2434640522875817, 
"acc_stderr,none": 0.017362473762146627}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.041220665028782834}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24081632653061225, "acc_stderr,none": 0.027372942201788163}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.208955223880597, "acc_stderr,none": 0.028748298931728655}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_stem": {"acc,none": 0.26482714874722485, "acc_stderr,none": 0.007854428006030955, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2740740740740741, "acc_stderr,none": 0.03853254836552003}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.0378272898086547}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.0358687928008034}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322716}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.35, "acc_stderr,none": 0.04793724854411021}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.04488482852329017}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.19148936170212766, "acc_stderr,none": 0.025722149992637795}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.03600105692727773}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25132275132275134, "acc_stderr,none": 0.022340482339643895}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2709677419354839, "acc_stderr,none": 0.025284416114900156}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.03178529710642749}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.27037037037037037, "acc_stderr,none": 0.027080372815145654}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.32450331125827814, "acc_stderr,none": 0.03822746937658752}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2824074074074074, "acc_stderr,none": 0.030701372111510934}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.19642857142857142, "acc_stderr,none": 0.03770970049347019}, "mmlu_pro": {"exact_match,custom-extract": 0.09092420212765957, "exact_match_stderr,custom-extract": 0.002611579557836141, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.09483960948396095, "exact_match_stderr,custom-extract": 0.010949672704790067}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 
0.12040557667934093, "exact_match_stderr,custom-extract": 0.011593149221103011}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.04858657243816254, "exact_match_stderr,custom-extract": 0.0063930995498271675}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.11951219512195121, "exact_match_stderr,custom-extract": 0.016040065235546762}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.11492890995260663, "exact_match_stderr,custom-extract": 0.010984743847800618}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07223942208462332, "exact_match_stderr,custom-extract": 0.008320844580110062}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.11124694376528117, "exact_match_stderr,custom-extract": 0.011000782283753598}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.09973753280839895, "exact_match_stderr,custom-extract": 0.015371706524248085}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07720254314259764, "exact_match_stderr,custom-extract": 0.00804771624719557}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.05921539600296077, "exact_match_stderr,custom-extract": 0.006423852131454628}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11147186147186147, "exact_match_stderr,custom-extract": 0.010358988935082428}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1462925851703407, "exact_match_stderr,custom-extract": 0.015836201263905454}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07775211701308699, "exact_match_stderr,custom-extract": 0.007432631448995907}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.11152882205513784, "exact_match_stderr,custom-extract": 0.011150287588558724}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.228, "acc_stderr,none": 0.018781306529363204, "acc_norm,none": 0.348, "acc_norm_stderr,none": 0.0213237286328075}, "piqa": {"alias": "piqa", "acc,none": 0.6648531011969532, "acc_stderr,none": 0.011013513128643937, "acc_norm,none": 0.6594124047878128, "acc_norm_stderr,none": 0.011057027540404739}, "race": {"alias": "race", "acc,none": 0.3435406698564593, "acc_stderr,none": 0.014697475413671399}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4083930399181167, "acc_stderr,none": 0.011122558066098066}, "winogrande": {"alias": "winogrande", "acc,none": 0.601420678768745, "acc_stderr,none": 0.01376035717687384}} {"created_at": "2025-04-19T17:14:11.233978", "global_step": 130000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.29948805460750855, "acc_stderr,none": 0.013385021637313565, "acc_norm,none": 0.3506825938566553, "acc_norm_stderr,none": 0.01394463593072609}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6207912457912458, "acc_stderr,none": 0.009955891668865565, "acc_norm,none": 0.5656565656565656, "acc_norm_stderr,none": 0.010170943451269425}, "boolq": {"alias": "boolq", "acc,none": 0.6226299694189602, "acc_stderr,none": 0.008477957863309996}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2153972153972154, "acc_stderr,none": 0.011769690686226967}, "copa": {"alias": "copa", "acc,none": 0.67, "acc_stderr,none": 0.04725815626252607}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3836885082652858, "acc_stderr,none": 0.004852896681736762, "acc_norm,none": 0.4971121290579566, 
"acc_norm_stderr,none": 0.004989698183207837}, "mmlu": {"acc,none": 0.25651616578834924, "acc_stderr,none": 0.003684682926611866, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25674814027630183, "acc_stderr,none": 0.006368417858239969, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.19047619047619047, "acc_stderr,none": 0.035122074123020534}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03225078108306289}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.028867431449849313}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25316455696202533, "acc_stderr,none": 0.0283046579430353}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.30578512396694213, "acc_stderr,none": 0.04205953933884124}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.041331194402438376}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25766871165644173, "acc_stderr,none": 0.03436150827846917}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2774566473988439, "acc_stderr,none": 0.024105712607754307}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.28938906752411575, "acc_stderr,none": 0.025755865922632945}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2808641975308642, "acc_stderr,none": 0.02500646975579921}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2542372881355932, "acc_stderr,none": 0.011121129007840678}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.03508771929824563}, "mmlu_other": {"acc,none": 0.2594142259414226, "acc_stderr,none": 0.007858659302693206, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22641509433962265, "acc_stderr,none": 0.025757559893106744}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2832369942196532, "acc_stderr,none": 0.03435568056047873}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2914798206278027, "acc_stderr,none": 0.030500283176545906}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026622}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.26495726495726496, "acc_stderr,none": 0.02891120880274948}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2681992337164751, "acc_stderr,none": 0.015842430835269438}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.26143790849673204, "acc_stderr,none": 0.025160998214292456}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2695035460992908, "acc_stderr,none": 0.026469036818590634}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 
0.20220588235294118, "acc_stderr,none": 0.024398192986654924}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.24096385542168675, "acc_stderr,none": 0.0332939411907353}, "mmlu_social_sciences": {"acc,none": 0.24244393890152746, "acc_stderr,none": 0.007727792177441534, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2676767676767677, "acc_stderr,none": 0.03154449888270285}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22797927461139897, "acc_stderr,none": 0.030276909945178256}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2205128205128205, "acc_stderr,none": 0.02102067268082791}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2184873949579832, "acc_stderr,none": 0.026841514322958945}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22935779816513763, "acc_stderr,none": 0.018025349724618684}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.21374045801526717, "acc_stderr,none": 0.035954616117746904}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27941176470588236, "acc_stderr,none": 0.018152871051538802}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2, "acc_stderr,none": 0.03831305140884601}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2693877551020408, "acc_stderr,none": 0.02840125202902294}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401465}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_stem": {"acc,none": 0.26704725658103395, "acc_stderr,none": 0.007882480124802178, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.34814814814814815, "acc_stderr,none": 0.041153246103369526}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.03690677986137283}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.23404255319148937, "acc_stderr,none": 0.02767845257821239}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.03806142687309993}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2724867724867725, 
"acc_stderr,none": 0.022930973071633356}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.267741935483871, "acc_stderr,none": 0.025189006660212385}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.031947400722655395}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712163}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.03603038545360384}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.24537037037037038, "acc_stderr,none": 0.029346665094372937}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.26785714285714285, "acc_stderr,none": 0.04203277291467762}, "mmlu_pro": {"exact_match,custom-extract": 0.07945478723404255, "exact_match_stderr,custom-extract": 0.002459843926249574, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.07810320781032078, "exact_match_stderr,custom-extract": 0.010028109705912851}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.08365019011406843, "exact_match_stderr,custom-extract": 0.0098628176676401}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.053886925795053005, "exact_match_stderr,custom-extract": 0.00671400907004024}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.09268292682926829, "exact_match_stderr,custom-extract": 0.014338963443185472}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.08412322274881516, "exact_match_stderr,custom-extract": 0.009560105553845817}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.0608875128998968, "exact_match_stderr,custom-extract": 0.007685732170768645}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1136919315403423, "exact_match_stderr,custom-extract": 0.01110570531879303}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.06824146981627296, "exact_match_stderr,custom-extract": 0.012935525502883795}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.06267029972752043, "exact_match_stderr,custom-extract": 0.007307700375729609}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0540340488527017, "exact_match_stderr,custom-extract": 0.006153250897322268}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10930735930735931, "exact_match_stderr,custom-extract": 0.01027041003698769}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11823647294589178, "exact_match_stderr,custom-extract": 0.014468953704661754}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08775981524249422, "exact_match_stderr,custom-extract": 0.007853533553054275}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.08771929824561403, "exact_match_stderr,custom-extract": 0.01002034460979316}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.226, "acc_stderr,none": 0.018722956449139922, "acc_norm,none": 0.358, "acc_norm_stderr,none": 0.021461434862859115}, "piqa": {"alias": "piqa", 
"acc,none": 0.6664853101196954, "acc_stderr,none": 0.011000139592184571, "acc_norm,none": 0.6702937976060935, "acc_norm_stderr,none": 0.010968357083095152}, "race": {"alias": "race", "acc,none": 0.3349282296650718, "acc_stderr,none": 0.014606961503556259}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.3971340839303992, "acc_stderr,none": 0.011072044556731081}, "winogrande": {"alias": "winogrande", "acc,none": 0.584846093133386, "acc_stderr,none": 0.013848684086658583}} {"created_at": "2025-04-19T18:09:23.043359", "global_step": 138000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2781569965870307, "acc_stderr,none": 0.013094469919538802, "acc_norm,none": 0.3267918088737201, "acc_norm_stderr,none": 0.013706665975587333}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5942760942760943, "acc_stderr,none": 0.01007575554012887, "acc_norm,none": 0.5656565656565656, "acc_norm_stderr,none": 0.010170943451269423}, "boolq": {"alias": "boolq", "acc,none": 0.6376146788990825, "acc_stderr,none": 0.00840730865586405}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20638820638820637, "acc_stderr,none": 0.011586881879177844}, "copa": {"alias": "copa", "acc,none": 0.71, "acc_stderr,none": 0.04560480215720684}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38279227245568614, "acc_stderr,none": 0.004850748687859942, "acc_norm,none": 0.4939255128460466, "acc_norm_stderr,none": 0.004989413158034797}, "mmlu": {"acc,none": 0.24704458054408204, "acc_stderr,none": 0.003634274638440293, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24399574920297556, "acc_stderr,none": 0.006265008342474363, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.19047619047619047, "acc_stderr,none": 0.035122074123020534}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.22424242424242424, "acc_stderr,none": 0.032568666616811015}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591362}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2869198312236287, "acc_stderr,none": 0.029443773022594693}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.03984979653302872}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2331288343558282, "acc_stderr,none": 0.033220157957767414}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23410404624277456, "acc_stderr,none": 0.022797110278071134}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2540192926045016, "acc_stderr,none": 0.024723861504771686}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2623456790123457, "acc_stderr,none": 0.024477222856135114}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24445893089960888, "acc_stderr,none": 0.010976425013113886}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2046783625730994, "acc_stderr,none": 0.03094445977853321}, "mmlu_other": {"acc,none": 0.2729320888316704, "acc_stderr,none": 0.007974675239307017, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - 
business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.27547169811320754, "acc_stderr,none": 0.027495663683724057}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173042}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.37668161434977576, "acc_stderr,none": 0.03252113489929187}, "mmlu_management": {"alias": " - management", "acc,none": 0.2621359223300971, "acc_stderr,none": 0.043546310772605956}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.029480360549541187}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2822477650063857, "acc_stderr,none": 0.01609530296987856}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24183006535947713, "acc_stderr,none": 0.024518195641879334}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.026011992930902002}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.024562204314142314}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3192771084337349, "acc_stderr,none": 0.03629335329947859}, "mmlu_social_sciences": {"acc,none": 0.23561910952226195, "acc_stderr,none": 0.00764784875971722, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.04266339443159395}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.029620227874790465}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20725388601036268, "acc_stderr,none": 0.029252823291803627}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.23333333333333334, "acc_stderr,none": 0.021444547301560486}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23109243697478993, "acc_stderr,none": 0.027381406927868973}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23486238532110093, "acc_stderr,none": 0.01817511051034357}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596918}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2565359477124183, "acc_stderr,none": 0.017667841612379002}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3181818181818182, "acc_stderr,none": 0.04461272175910508}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.17551020408163265, "acc_stderr,none": 0.024352800722970015}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.030147775935409214}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036844}, "mmlu_stem": {"acc,none": 0.23723437995559785, "acc_stderr,none": 0.007556869596793137, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 
0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.037125378336148665}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.18421052631578946, "acc_stderr,none": 0.031546980450822305}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2152777777777778, "acc_stderr,none": 0.03437079344106136}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.13, "acc_stderr,none": 0.033799766898963086}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.040233822736177455}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3191489361702128, "acc_stderr,none": 0.03047297336338003}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2206896551724138, "acc_stderr,none": 0.03455930201924812}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24867724867724866, "acc_stderr,none": 0.02226181769240019}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25806451612903225, "acc_stderr,none": 0.02489246917246284}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.03144712581678242}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1527777777777778, "acc_stderr,none": 0.02453632602613423}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04287858751340456}, "mmlu_pro": {"exact_match,custom-extract": 0.0845246010638298, "exact_match_stderr,custom-extract": 0.0025273499910436702, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.1101813110181311, "exact_match_stderr,custom-extract": 0.011701680018357825}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.08365019011406843, "exact_match_stderr,custom-extract": 0.009862817667640102}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.04328621908127209, "exact_match_stderr,custom-extract": 0.0060511029106954765}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.07317073170731707, "exact_match_stderr,custom-extract": 0.012876769299821081}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1066350710900474, "exact_match_stderr,custom-extract": 0.010630426613851848}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07120743034055728, "exact_match_stderr,custom-extract": 0.00826578956126575}, 
"mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1100244498777506, "exact_match_stderr,custom-extract": 0.010947693055603884}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10761154855643044, "exact_match_stderr,custom-extract": 0.015896979452723368}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.06357856494096276, "exact_match_stderr,custom-extract": 0.007356897259839129}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06217616580310881, "exact_match_stderr,custom-extract": 0.006572123520013911}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.09632034632034632, "exact_match_stderr,custom-extract": 0.009711032762719356}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12024048096192384, "exact_match_stderr,custom-extract": 0.014574466566661982}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07467282525019246, "exact_match_stderr,custom-extract": 0.007296113874717238}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12907268170426064, "exact_match_stderr,custom-extract": 0.011876239922954071}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.216, "acc_stderr,none": 0.01842190906141194, "acc_norm,none": 0.322, "acc_norm_stderr,none": 0.020916668330019882}, "piqa": {"alias": "piqa", "acc,none": 0.6692056583242655, "acc_stderr,none": 0.010977520584714434, "acc_norm,none": 0.6692056583242655, "acc_norm_stderr,none": 0.010977520584714436}, "race": {"alias": "race", "acc,none": 0.33588516746411484, "acc_stderr,none": 0.014617286312430684}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4068577277379734, "acc_stderr,none": 0.011116027212644338}, "winogrande": {"alias": "winogrande", "acc,none": 0.5777426992896606, "acc_stderr,none": 0.013881582030658557}} {"created_at": "2025-04-19T19:58:53.764070", "global_step": 140000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.31399317406143346, "acc_stderr,none": 0.013562691224726291, "acc_norm,none": 0.35238907849829354, "acc_norm_stderr,none": 0.013960142600598684}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6393097643097643, "acc_stderr,none": 0.009853512108416737, "acc_norm,none": 0.6174242424242424, "acc_norm_stderr,none": 0.009972837790531477}, "boolq": {"alias": "boolq", "acc,none": 0.6033639143730887, "acc_stderr,none": 0.008556148582032}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19574119574119575, "acc_stderr,none": 0.011359497363584395}, "copa": {"alias": "copa", "acc,none": 0.69, "acc_stderr,none": 0.04648231987117316}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38408683529177456, "acc_stderr,none": 0.004853845750392155, "acc_norm,none": 0.4965146385182235, "acc_norm_stderr,none": 0.004989660180792159}, "mmlu": {"acc,none": 0.25345392394245836, "acc_stderr,none": 0.0036703597670141933, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2541976620616366, "acc_stderr,none": 0.006351130759838543, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.0393253768039287}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03225078108306289}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - 
high_school_world_history", "acc,none": 0.23628691983122363, "acc_stderr,none": 0.027652153144159263}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2809917355371901, "acc_stderr,none": 0.041032038305145124}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.04414343666854933}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2331288343558282, "acc_stderr,none": 0.03322015795776741}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.25722543352601157, "acc_stderr,none": 0.02353292543104428}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2315112540192926, "acc_stderr,none": 0.023956532766639133}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2345679012345679, "acc_stderr,none": 0.023576881744005723}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2653194263363755, "acc_stderr,none": 0.01127619884395888}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.03508771929824563}, "mmlu_other": {"acc,none": 0.2658513035082073, "acc_stderr,none": 0.007914960467870515, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22641509433962265, "acc_stderr,none": 0.025757559893106734}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.03242414757483099}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.30493273542600896, "acc_stderr,none": 0.030898610882477515}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.041858325989283136}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3247863247863248, "acc_stderr,none": 0.03067902276549883}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.280970625798212, "acc_stderr,none": 0.01607312785122124}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.025261691219729477}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.02601199293090201}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19117647058823528, "acc_stderr,none": 0.023886881922440338}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.26506024096385544, "acc_stderr,none": 0.03436024037944967}, "mmlu_social_sciences": {"acc,none": 0.2512187195320117, "acc_stderr,none": 0.007822055354274398, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.042270544512322}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2676767676767677, "acc_stderr,none": 0.031544498882702866}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.24352331606217617, "acc_stderr,none": 0.03097543638684542}, 
"mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2512820512820513, "acc_stderr,none": 0.021992016662370568}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.226890756302521, "acc_stderr,none": 0.027205371538279483}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24770642201834864, "acc_stderr,none": 0.018508143602547815}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2900763358778626, "acc_stderr,none": 0.03980066246467766}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2565359477124183, "acc_stderr,none": 0.01766784161237899}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.32727272727272727, "acc_stderr,none": 0.044942908662520896}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2163265306122449, "acc_stderr,none": 0.026358916334904055}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.21393034825870647, "acc_stderr,none": 0.02899690969332891}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_stem": {"acc,none": 0.24230891214716144, "acc_stderr,none": 0.007633363084440964, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322674}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.03785714465066654}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2565789473684211, "acc_stderr,none": 0.0355418036802569}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.04158307533083286}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.23829787234042554, "acc_stderr,none": 0.02785125297388978}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.036951833116502325}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2328042328042328, "acc_stderr,none": 0.021765961672154523}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24193548387096775, "acc_stderr,none": 0.0243625996930311}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.17733990147783252, "acc_stderr,none": 0.026874337276808352}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.21851851851851853, "acc_stderr,none": 0.02519575225182379}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.03603038545360383}, 
"mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.027920963147993662}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25, "acc_stderr,none": 0.04109974682633932}, "mmlu_pro": {"exact_match,custom-extract": 0.0999002659574468, "exact_match_stderr,custom-extract": 0.002724750732382453, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.13110181311018132, "exact_match_stderr,custom-extract": 0.012613403336459914}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10012674271229405, "exact_match_stderr,custom-extract": 0.010693074879962131}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07243816254416961, "exact_match_stderr,custom-extract": 0.007707683029020954}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.0951219512195122, "exact_match_stderr,custom-extract": 0.014506870947377851}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.09360189573459715, "exact_match_stderr,custom-extract": 0.01003201168369}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07223942208462332, "exact_match_stderr,custom-extract": 0.008320844580110065}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.11124694376528117, "exact_match_stderr,custom-extract": 0.011000782283753577}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.14960629921259844, "exact_match_stderr,custom-extract": 0.018297559115940484}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10717529518619437, "exact_match_stderr,custom-extract": 0.009326830860379758}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.05847520355292376, "exact_match_stderr,custom-extract": 0.0063860875475300095}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12554112554112554, "exact_match_stderr,custom-extract": 0.010905908590641358}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13226452905811623, "exact_match_stderr,custom-extract": 0.015181011139551248}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09160892994611239, "exact_match_stderr,custom-extract": 0.008006966049388345}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.14160401002506265, "exact_match_stderr,custom-extract": 0.012349587610134341}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.256, "acc_stderr,none": 0.0195369235747476, "acc_norm,none": 0.354, "acc_norm_stderr,none": 0.021407582047916447}, "piqa": {"alias": "piqa", "acc,none": 0.6594124047878128, "acc_stderr,none": 0.011057027540404739, "acc_norm,none": 0.6692056583242655, "acc_norm_stderr,none": 0.01097752058471444}, "race": {"alias": "race", "acc,none": 0.3263157894736842, "acc_stderr,none": 0.014510987877134937}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.40583418628454454, "acc_stderr,none": 0.011111610832965834}, "winogrande": {"alias": "winogrande", "acc,none": 0.5777426992896606, "acc_stderr,none": 0.013881582030658557}} {"created_at": "2025-04-19T21:39:07.044394", "global_step": 142000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.31143344709897613, "acc_stderr,none": 0.013532472099850949, "acc_norm,none": 0.3609215017064846, "acc_norm_stderr,none": 0.01403476138617546}, "arc_easy": 
{"alias": "arc_easy", "acc,none": 0.6346801346801347, "acc_stderr,none": 0.009880576614806926, "acc_norm,none": 0.5946969696969697, "acc_norm_stderr,none": 0.010074093589739194}, "boolq": {"alias": "boolq", "acc,none": 0.4840978593272171, "acc_stderr,none": 0.008740630991997792}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2203112203112203, "acc_stderr,none": 0.011865854943402442}, "copa": {"alias": "copa", "acc,none": 0.68, "acc_stderr,none": 0.046882617226215034}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3858793069109739, "acc_stderr,none": 0.004858074013443991, "acc_norm,none": 0.5008962358095996, "acc_norm_stderr,none": 0.004989773395468894}, "mmlu": {"acc,none": 0.24854009400370317, "acc_stderr,none": 0.0036428805202809865, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2507970244420829, "acc_stderr,none": 0.006324661181863029, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.036196045241242494}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.03477691162163659}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.030778554678693257}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.27848101265822783, "acc_stderr,none": 0.029178682304842555}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.24793388429752067, "acc_stderr,none": 0.03941897526516302}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.03351953879521269}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.022894082489925992}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.27009646302250806, "acc_stderr,none": 0.02521804037341062}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.023788583551658533}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2529335071707953, "acc_stderr,none": 0.01110226871383999}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23976608187134502, "acc_stderr,none": 0.03274485211946956}, "mmlu_other": {"acc,none": 0.2661731573865465, "acc_stderr,none": 0.007907953803387787, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2679245283018868, "acc_stderr,none": 0.027257260322494845}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.0309528902177499}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.3, "acc_stderr,none": 0.04605661864718381}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.336322869955157, "acc_stderr,none": 0.031708824268455}, "mmlu_management": {"alias": " - management", "acc,none": 0.2912621359223301, "acc_stderr,none": 0.044986763205729224}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.23931623931623933, 
"acc_stderr,none": 0.02795182680892433}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2796934865900383, "acc_stderr,none": 0.016050792148036522}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23202614379084968, "acc_stderr,none": 0.024170840879341016}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2730496453900709, "acc_stderr,none": 0.026577860943307854}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.18382352941176472, "acc_stderr,none": 0.023529242185193106}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.35542168674698793, "acc_stderr,none": 0.03726214354322415}, "mmlu_social_sciences": {"acc,none": 0.24049398765030874, "acc_stderr,none": 0.007700935783829868, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748141}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.21717171717171718, "acc_stderr,none": 0.029376616484945644}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2230769230769231, "acc_stderr,none": 0.021107730127244}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.0275536144678638}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23486238532110093, "acc_stderr,none": 0.01817511051034357}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.21374045801526717, "acc_stderr,none": 0.0359546161177469}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.017952449196987866}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.35454545454545455, "acc_stderr,none": 0.04582004841505416}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24489795918367346, "acc_stderr,none": 0.027529637440174923}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22388059701492538, "acc_stderr,none": 0.02947525023601718}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165065}, "mmlu_stem": {"acc,none": 0.2356485886457342, "acc_stderr,none": 0.007544952997828368, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.03749850709174021}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.17105263157894737, "acc_stderr,none": 0.0306436070716771}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.0358687928008034}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.16, "acc_stderr,none": 0.03684529491774709}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 
0.041633319989322695}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617746}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.31063829787234043, "acc_stderr,none": 0.03025123757921317}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131184}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23544973544973544, "acc_stderr,none": 0.021851509822031708}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24516129032258063, "acc_stderr,none": 0.024472243840895535}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2315270935960591, "acc_stderr,none": 0.029678333141444444}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036847}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.02620276653465215}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2052980132450331, "acc_stderr,none": 0.03297986648473837}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18981481481481483, "acc_stderr,none": 0.02674471483469191}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.26785714285714285, "acc_stderr,none": 0.04203277291467762}, "mmlu_pro": {"exact_match,custom-extract": 0.09449800531914894, "exact_match_stderr,custom-extract": 0.002652556106218792, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.13389121338912133, "exact_match_stderr,custom-extract": 0.012726405288731828}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11660329531051965, "exact_match_stderr,custom-extract": 0.011433262922605285}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.038869257950530034, "exact_match_stderr,custom-extract": 0.005747289272769007}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.07317073170731707, "exact_match_stderr,custom-extract": 0.012876769299821067}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.0971563981042654, "exact_match_stderr,custom-extract": 0.010200657784227771}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.0804953560371517, "exact_match_stderr,custom-extract": 0.008744292925925108}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.14058679706601468, "exact_match_stderr,custom-extract": 0.012160802933047542}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11023622047244094, "exact_match_stderr,custom-extract": 0.016065998434778163}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07538601271571299, "exact_match_stderr,custom-extract": 0.00796029703663129}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06809770540340489, "exact_match_stderr,custom-extract": 0.006856216855671768}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12987012987012986, "exact_match_stderr,custom-extract": 0.011064857512116031}, 
"mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1282565130260521, "exact_match_stderr,custom-extract": 0.01498371136300156}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07159353348729793, "exact_match_stderr,custom-extract": 0.007155971971815129}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.13283208020050125, "exact_match_stderr,custom-extract": 0.012021922607242955}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.24, "acc_stderr,none": 0.019118866653759743, "acc_norm,none": 0.346, "acc_norm_stderr,none": 0.021294951277234637}, "piqa": {"alias": "piqa", "acc,none": 0.6534276387377584, "acc_stderr,none": 0.01110302032087218, "acc_norm,none": 0.6697497279651795, "acc_norm_stderr,none": 0.010972947133006297}, "race": {"alias": "race", "acc,none": 0.3550239234449761, "acc_stderr,none": 0.014809839887617086}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4150460593654043, "acc_stderr,none": 0.0111495633967375}, "winogrande": {"alias": "winogrande", "acc,none": 0.5627466456195738, "acc_stderr,none": 0.013941393310695925}} {"created_at": "2025-04-19T23:35:00.939601", "global_step": 144000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3046075085324232, "acc_stderr,none": 0.01344952210993249, "acc_norm,none": 0.3361774744027304, "acc_norm_stderr,none": 0.013804855026205754}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6443602693602694, "acc_stderr,none": 0.009822854395535483, "acc_norm,none": 0.6186868686868687, "acc_norm_stderr,none": 0.009966542497171018}, "boolq": {"alias": "boolq", "acc,none": 0.527217125382263, "acc_stderr,none": 0.008732089182349662}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2031122031122031, "acc_stderr,none": 0.011518254793634084}, "copa": {"alias": "copa", "acc,none": 0.64, "acc_stderr,none": 0.048241815132442176}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38966341366261703, "acc_stderr,none": 0.0048667723730299265, "acc_norm,none": 0.5007966540529775, "acc_norm_stderr,none": 0.004989775077835656}, "mmlu": {"acc,none": 0.24533542230451502, "acc_stderr,none": 0.00362771251250637, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2312433581296493, "acc_stderr,none": 0.00613868688210758, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.03333333333333338}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.17575757575757575, "acc_stderr,none": 0.02972094300622445}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.03132179803083291}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.29535864978902954, "acc_stderr,none": 0.02969633871342289}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.1487603305785124, "acc_stderr,none": 0.03248470083807194}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.04077494709252626}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.294478527607362, "acc_stderr,none": 0.03581165790474082}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.023083658586984204}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24022346368715083, "acc_stderr,none": 0.014288343803925302}, 
"mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19614147909967847, "acc_stderr,none": 0.02255244778047804}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.022779719088733396}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23142112125162972, "acc_stderr,none": 0.01077146171157646}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.21637426900584794, "acc_stderr,none": 0.031581495393387345}, "mmlu_other": {"acc,none": 0.25587383327969104, "acc_stderr,none": 0.007825304828822481, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2679245283018868, "acc_stderr,none": 0.02725726032249485}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2832369942196532, "acc_stderr,none": 0.03435568056047874}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.27184466019417475, "acc_stderr,none": 0.044052680241409216}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2564102564102564, "acc_stderr,none": 0.028605953702004253}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.22094508301404853, "acc_stderr,none": 0.014836205167333584}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2679738562091503, "acc_stderr,none": 0.025360603796242553}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.26595744680851063, "acc_stderr,none": 0.026358065698880592}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.02679956202488768}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.22289156626506024, "acc_stderr,none": 0.03240004825594686}, "mmlu_social_sciences": {"acc,none": 0.2512187195320117, "acc_stderr,none": 0.007823327645338496, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.03835153954399421}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25757575757575757, "acc_stderr,none": 0.031156269519646836}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.258974358974359, "acc_stderr,none": 0.022211106810061658}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24369747899159663, "acc_stderr,none": 0.027886828078380558}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.27339449541284405, "acc_stderr,none": 0.019109299846098292}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.03641297081313729}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.017630827375148383}, "mmlu_public_relations": 
{"alias": " - public_relations", "acc,none": 0.3, "acc_stderr,none": 0.04389311454644286}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2612244897959184, "acc_stderr,none": 0.02812342933514278}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23383084577114427, "acc_stderr,none": 0.029929415408348377}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_stem": {"acc,none": 0.25023786869647957, "acc_stderr,none": 0.007704837513159296, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.28888888888888886, "acc_stderr,none": 0.03915450630414251}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.03317672787533158}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2916666666666667, "acc_stderr,none": 0.03800968060554858}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165044}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.04488482852329017}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2127659574468085, "acc_stderr,none": 0.026754391348039766}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.18620689655172415, "acc_stderr,none": 0.032439461590046154}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.02201908001221789}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.23548387096774193, "acc_stderr,none": 0.02413763242933771}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.21674876847290642, "acc_stderr,none": 0.02899033125251624}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.026842057873833706}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.32450331125827814, "acc_stderr,none": 0.03822746937658752}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3194444444444444, "acc_stderr,none": 0.031798763421768524}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.1875, "acc_stderr,none": 0.0370468111477387}, "mmlu_pro": {"exact_match,custom-extract": 0.09557845744680851, "exact_match_stderr,custom-extract": 0.002674000133570905, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.09902370990237098, "exact_match_stderr,custom-extract": 0.011162713195868411}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09759188846641319, "exact_match_stderr,custom-extract": 0.010571710152486615}, "mmlu_pro_chemistry": {"alias": 
" - chemistry", "exact_match,custom-extract": 0.05653710247349823, "exact_match_stderr,custom-extract": 0.006867487601519864}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.09268292682926829, "exact_match_stderr,custom-extract": 0.014338963443185472}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.10545023696682465, "exact_match_stderr,custom-extract": 0.010578211479944635}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07946336429308566, "exact_match_stderr,custom-extract": 0.008692933034367002}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.11735941320293398, "exact_match_stderr,custom-extract": 0.011260038705395159}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11548556430446194, "exact_match_stderr,custom-extract": 0.016395494305781085}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09627611262488647, "exact_match_stderr,custom-extract": 0.008893665915732372}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07031828275351591, "exact_match_stderr,custom-extract": 0.006958800549270529}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11688311688311688, "exact_match_stderr,custom-extract": 0.010575091539720223}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.10220440881763528, "exact_match_stderr,custom-extract": 0.013574032292671014}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08929946112394149, "exact_match_stderr,custom-extract": 0.007915436507576855}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.14786967418546365, "exact_match_stderr,custom-extract": 0.012573709084942283}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.228, "acc_stderr,none": 0.018781306529363204, "acc_norm,none": 0.334, "acc_norm_stderr,none": 0.021113492347743727}, "piqa": {"alias": "piqa", "acc,none": 0.6572361262241567, "acc_stderr,none": 0.011073978007039314, "acc_norm,none": 0.6686615886833515, "acc_norm_stderr,none": 0.010982077458957348}, "race": {"alias": "race", "acc,none": 0.3397129186602871, "acc_stderr,none": 0.014657914432586402}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.40583418628454454, "acc_stderr,none": 0.011111610832965838}, "winogrande": {"alias": "winogrande", "acc,none": 0.5714285714285714, "acc_stderr,none": 0.013908353814606703}} {"created_at": "2025-04-20T01:30:42.898190", "global_step": 146000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.28498293515358364, "acc_stderr,none": 0.013191348179838793, "acc_norm,none": 0.33276450511945393, "acc_norm_stderr,none": 0.013769863046192307}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5984848484848485, "acc_stderr,none": 0.010058790020755572, "acc_norm,none": 0.5627104377104377, "acc_norm_stderr,none": 0.010178768429321585}, "boolq": {"alias": "boolq", "acc,none": 0.6116207951070336, "acc_stderr,none": 0.008524357307908785}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19246519246519248, "acc_stderr,none": 0.011286955409752627}, "copa": {"alias": "copa", "acc,none": 0.65, "acc_stderr,none": 0.047937248544110196}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38478390758812986, "acc_stderr,none": 0.004855498343308384, "acc_norm,none": 0.49790878311093406, "acc_norm_stderr,none": 0.004989737768749953}, "mmlu": {"acc,none": 0.2673408346389403, 
"acc_stderr,none": 0.0037175918849170126, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24782146652497344, "acc_stderr,none": 0.006293336343197528, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.29365079365079366, "acc_stderr,none": 0.04073524322147126}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.03401506715249039}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.030190282453501964}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.21518987341772153, "acc_stderr,none": 0.026750826994676187}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.17355371900826447, "acc_stderr,none": 0.0345727283691767}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.04453197507374983}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.032910995786157686}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.22832369942196531, "acc_stderr,none": 0.022598703804321628}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.26033519553072626, "acc_stderr,none": 0.014676252009319471}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.24758842443729903, "acc_stderr,none": 0.024513879973621967}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.22530864197530864, "acc_stderr,none": 0.02324620264781975}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26010430247718386, "acc_stderr,none": 0.011204382887823827}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.19883040935672514, "acc_stderr,none": 0.03061111655743253}, "mmlu_other": {"acc,none": 0.25619568715803026, "acc_stderr,none": 0.007763042505978551, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.29056603773584905, "acc_stderr,none": 0.027943219989337145}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.31213872832369943, "acc_stderr,none": 0.035331333893236574}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.14798206278026907, "acc_stderr,none": 0.023831557157613537}, "mmlu_management": {"alias": " - management", "acc,none": 0.3883495145631068, "acc_stderr,none": 0.0482572933735639}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2564102564102564, "acc_stderr,none": 0.02860595370200425}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.20178799489144317, "acc_stderr,none": 0.014351702181636864}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2679738562091503, "acc_stderr,none": 0.025360603796242564}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24822695035460993, "acc_stderr,none": 0.025770015644290392}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3897058823529412, "acc_stderr,none": 0.029624663581159696}, "mmlu_virology": {"alias": 
" - virology", "acc,none": 0.24096385542168675, "acc_stderr,none": 0.033293941190735296}, "mmlu_social_sciences": {"acc,none": 0.29411764705882354, "acc_stderr,none": 0.008194752894192869, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3434343434343434, "acc_stderr,none": 0.03383201223244441}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.35233160621761656, "acc_stderr,none": 0.03447478286414357}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.26153846153846155, "acc_stderr,none": 0.022282141204204423}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31512605042016806, "acc_stderr,none": 0.030176808288974337}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3394495412844037, "acc_stderr,none": 0.02030210934266235}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2748091603053435, "acc_stderr,none": 0.039153454088478354}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.017952449196987862}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.04069306319721377}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3346938775510204, "acc_stderr,none": 0.030209235226242307}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2537313432835821, "acc_stderr,none": 0.030769444967296024}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_stem": {"acc,none": 0.2813193783698065, "acc_stderr,none": 0.007965887267299236, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.16, "acc_stderr,none": 0.03684529491774709}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.03547854198560827}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3092105263157895, "acc_stderr,none": 0.03761070869867479}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.38, "acc_stderr,none": 0.048783173121456316}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421296}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165044}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3137254901960784, "acc_stderr,none": 0.04617034827006715}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.34, "acc_stderr,none": 0.047609522856952344}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.23404255319148937, "acc_stderr,none": 0.027678452578212397}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131184}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2804232804232804, "acc_stderr,none": 0.02313528797432562}, "mmlu_high_school_biology": 
{"alias": " - high_school_biology", "acc,none": 0.3032258064516129, "acc_stderr,none": 0.026148685930671746}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.0319474007226554}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.14, "acc_stderr,none": 0.03487350880197774}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2740740740740741, "acc_stderr,none": 0.027195934804085622}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.33112582781456956, "acc_stderr,none": 0.038425817186598696}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3888888888888889, "acc_stderr,none": 0.033247089118091176}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.33035714285714285, "acc_stderr,none": 0.04464285714285714}, "mmlu_pro": {"exact_match,custom-extract": 0.08244680851063829, "exact_match_stderr,custom-extract": 0.002496013418937015, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.12691771269177127, "exact_match_stderr,custom-extract": 0.012440338452215536}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09125475285171103, "exact_match_stderr,custom-extract": 0.010258543729935022}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.03356890459363958, "exact_match_stderr,custom-extract": 0.005355780010493135}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.07804878048780488, "exact_match_stderr,custom-extract": 0.013264026422963548}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.11492890995260663, "exact_match_stderr,custom-extract": 0.010984743847800599}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07223942208462332, "exact_match_stderr,custom-extract": 0.008320844580110072}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.10880195599022005, "exact_match_stderr,custom-extract": 0.010894177212595784}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.09186351706036745, "exact_match_stderr,custom-extract": 0.014816829983934028}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.05631244323342416, "exact_match_stderr,custom-extract": 0.006950562565230877}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.049592894152479645, "exact_match_stderr,custom-extract": 0.005908778090269242}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.09307359307359307, "exact_match_stderr,custom-extract": 0.00956309374740362}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11222444889779559, "exact_match_stderr,custom-extract": 0.014144273960803908}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08160123171670516, "exact_match_stderr,custom-extract": 0.00759847881839772}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.11403508771929824, "exact_match_stderr,custom-extract": 0.011258961939273109}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.242, "acc_stderr,none": 0.019173085678337164, "acc_norm,none": 0.332, "acc_norm_stderr,none": 0.021081766571222856}, "piqa": {"alias": "piqa", "acc,none": 0.6664853101196954, "acc_stderr,none": 
0.011000139592184575, "acc_norm,none": 0.6583242655059848, "acc_norm_stderr,none": 0.01106553514384152}, "race": {"alias": "race", "acc,none": 0.3435406698564593, "acc_stderr,none": 0.014697475413671399}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.39508700102354144, "acc_stderr,none": 0.011062205128594174}, "winogrande": {"alias": "winogrande", "acc,none": 0.5722178374112076, "acc_stderr,none": 0.013905134013839946}} {"created_at": "2025-04-20T03:15:48.727391", "global_step": 148000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.28498293515358364, "acc_stderr,none": 0.013191348179838793, "acc_norm,none": 0.3361774744027304, "acc_norm_stderr,none": 0.01380485502620576}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6073232323232324, "acc_stderr,none": 0.010020646555538693, "acc_norm,none": 0.6081649831649831, "acc_norm_stderr,none": 0.010016835016834971}, "boolq": {"alias": "boolq", "acc,none": 0.5116207951070336, "acc_stderr,none": 0.008742692742551265}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.1891891891891892, "acc_stderr,none": 0.011213159711868611}, "copa": {"alias": "copa", "acc,none": 0.7, "acc_stderr,none": 0.046056618647183814}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38856801433977295, "acc_stderr,none": 0.004864286176731826, "acc_norm,none": 0.5002987452698665, "acc_norm_stderr,none": 0.004989780520782243}, "mmlu": {"acc,none": 0.26463466742629255, "acc_stderr,none": 0.0037074336325129036, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2563230605738576, "acc_stderr,none": 0.006358257941407427, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.35714285714285715, "acc_stderr,none": 0.04285714285714281}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.28484848484848485, "acc_stderr,none": 0.035243908445117836}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591361}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2489451476793249, "acc_stderr,none": 0.028146970599422644}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.19008264462809918, "acc_stderr,none": 0.03581796951709282}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.038935425188248475}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615769}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.023445826276545543}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.27262569832402234, "acc_stderr,none": 0.01489339173524962}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2282958199356913, "acc_stderr,none": 0.023839303311398212}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.22530864197530864, "acc_stderr,none": 0.02324620264781975}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2711864406779661, "acc_stderr,none": 0.011354581451622981}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.1871345029239766, "acc_stderr,none": 0.029913127232368036}, "mmlu_other": {"acc,none": 0.2568393949147087, "acc_stderr,none": 0.007771661103268864, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.33, "acc_stderr,none": 
0.04725815626252606}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2943396226415094, "acc_stderr,none": 0.028049186315695245}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3179190751445087, "acc_stderr,none": 0.035506839891655796}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.14, "acc_stderr,none": 0.03487350880197772}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.17488789237668162, "acc_stderr,none": 0.025495284626444965}, "mmlu_management": {"alias": " - management", "acc,none": 0.3300970873786408, "acc_stderr,none": 0.0465614711001235}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.23931623931623933, "acc_stderr,none": 0.027951826808924333}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2120051085568327, "acc_stderr,none": 0.014616099385833681}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.27124183006535946, "acc_stderr,none": 0.025457756696667867}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25177304964539005, "acc_stderr,none": 0.025892151156709405}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.39338235294117646, "acc_stderr,none": 0.02967428828131118}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.19879518072289157, "acc_stderr,none": 0.03106939026078942}, "mmlu_social_sciences": {"acc,none": 0.27819304517387067, "acc_stderr,none": 0.008060982615360155, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.04303684033537315}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.30808080808080807, "acc_stderr,none": 0.032894773300986155}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.3160621761658031, "acc_stderr,none": 0.033553973696861736}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.26153846153846155, "acc_stderr,none": 0.02228214120420442}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2689075630252101, "acc_stderr,none": 0.028801392193631276}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.344954128440367, "acc_stderr,none": 0.020380605405066966}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596919}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.017282760695167435}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.20909090909090908, "acc_stderr,none": 0.03895091015724136}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2897959183673469, "acc_stderr,none": 0.02904308868330434}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916707}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.0429234695990928}, "mmlu_stem": {"acc,none": 0.27148747224865205, "acc_stderr,none": 0.007878235366786878, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, 
"mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.25, "acc_stderr,none": 0.03523807393012047}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2847222222222222, "acc_stderr,none": 0.037738099906869355}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3431372549019608, "acc_stderr,none": 0.047240073523838876}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2, "acc_stderr,none": 0.0261488180184245}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.02306818884826111}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.27741935483870966, "acc_stderr,none": 0.025470196835900055}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2315270935960591, "acc_stderr,none": 0.02967833314144444}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.026842057873833706}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2980132450331126, "acc_stderr,none": 0.037345356767871984}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.4166666666666667, "acc_stderr,none": 0.033622774366080424}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.22321428571428573, "acc_stderr,none": 0.039523019677025116}, "mmlu_pro": {"exact_match,custom-extract": 0.10089760638297872, "exact_match_stderr,custom-extract": 0.002741094643896509, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.12412831241283125, "exact_match_stderr,custom-extract": 0.012322509407061307}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10139416983523447, "exact_match_stderr,custom-extract": 0.01075295922902333}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07420494699646643, "exact_match_stderr,custom-extract": 0.007793679728569136}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.11951219512195121, "exact_match_stderr,custom-extract": 0.016040065235546762}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.12559241706161137, "exact_match_stderr,custom-extract": 0.011413658642349243}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.0825593395252838, "exact_match_stderr,custom-extract": 0.008845745053999925}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 
0.12224938875305623, "exact_match_stderr,custom-extract": 0.011460350236485605}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10498687664041995, "exact_match_stderr,custom-extract": 0.01572499120355455}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09627611262488647, "exact_match_stderr,custom-extract": 0.00889366591573237}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07105847520355292, "exact_match_stderr,custom-extract": 0.006992544617386965}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11147186147186147, "exact_match_stderr,custom-extract": 0.01035898893508243}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1402805611222445, "exact_match_stderr,custom-extract": 0.01556189386771249}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09006928406466513, "exact_match_stderr,custom-extract": 0.007946120960248708}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.11779448621553884, "exact_match_stderr,custom-extract": 0.011418740524805817}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.226, "acc_stderr,none": 0.018722956449139926, "acc_norm,none": 0.328, "acc_norm_stderr,none": 0.02101702716517549}, "piqa": {"alias": "piqa", "acc,none": 0.6626768226332971, "acc_stderr,none": 0.01103111478505969, "acc_norm,none": 0.6681175190424374, "acc_norm_stderr,none": 0.010986617776361582}, "race": {"alias": "race", "acc,none": 0.33588516746411484, "acc_stderr,none": 0.014617286312430682}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.40276356192425794, "acc_stderr,none": 0.011098061143371349}, "winogrande": {"alias": "winogrande", "acc,none": 0.5887924230465666, "acc_stderr,none": 0.013829128358676866}} {"created_at": "2025-04-20T05:06:59.139642", "global_step": 150000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2790102389078498, "acc_stderr,none": 0.013106784883601343, "acc_norm,none": 0.3250853242320819, "acc_norm_stderr,none": 0.013688147309729117}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5909090909090909, "acc_stderr,none": 0.010088775152615784, "acc_norm,none": 0.5656565656565656, "acc_norm_stderr,none": 0.010170943451269421}, "boolq": {"alias": "boolq", "acc,none": 0.5596330275229358, "acc_stderr,none": 0.008682635667686902}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.18755118755118755, "acc_stderr,none": 0.011175783964114743}, "copa": {"alias": "copa", "acc,none": 0.67, "acc_stderr,none": 0.04725815626252609}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3866759609639514, "acc_stderr,none": 0.004859930926500307, "acc_norm,none": 0.4911372236606254, "acc_norm_stderr,none": 0.004988997467134488}, "mmlu": {"acc,none": 0.2555903717419171, "acc_stderr,none": 0.0036776894008003548, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24420828905419767, "acc_stderr,none": 0.006267350634844606, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.29365079365079366, "acc_stderr,none": 0.04073524322147124}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.033175059300091805}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.031145570659486782}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2489451476793249, 
"acc_stderr,none": 0.028146970599422644}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2231404958677686, "acc_stderr,none": 0.03800754475228733}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04330043749650743}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.03351953879521269}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0230836585869842}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.26033519553072626, "acc_stderr,none": 0.014676252009319475}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.21543408360128619, "acc_stderr,none": 0.02335022547547143}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.23765432098765432, "acc_stderr,none": 0.023683591837008557}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23663624511082137, "acc_stderr,none": 0.010855137351572739}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03218093795602357}, "mmlu_other": {"acc,none": 0.2671387190215642, "acc_stderr,none": 0.007924224432540924, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.27547169811320754, "acc_stderr,none": 0.027495663683724057}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2658959537572254, "acc_stderr,none": 0.0336876293225943}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.038612291966536955}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.30493273542600896, "acc_stderr,none": 0.030898610882477515}, "mmlu_management": {"alias": " - management", "acc,none": 0.2524271844660194, "acc_stderr,none": 0.043012503996908764}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.31196581196581197, "acc_stderr,none": 0.030351527323344944}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24010217113665389, "acc_stderr,none": 0.015274685213734197}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2679738562091503, "acc_stderr,none": 0.025360603796242553}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2198581560283688, "acc_stderr,none": 0.024706141070705474}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.02841820861940679}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2469879518072289, "acc_stderr,none": 0.03357351982064536}, "mmlu_social_sciences": {"acc,none": 0.265518362040949, "acc_stderr,none": 0.007964697455314251, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.040969851398436716}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.29292929292929293, "acc_stderr,none": 0.03242497958178817}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.26424870466321243, "acc_stderr,none": 0.031821550509166484}, "mmlu_high_school_macroeconomics": {"alias": " - 
high_school_macroeconomics", "acc,none": 0.2794871794871795, "acc_stderr,none": 0.022752388839776823}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24789915966386555, "acc_stderr,none": 0.028047967224176892}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25321100917431194, "acc_stderr,none": 0.018644073041375046}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306085}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2434640522875817, "acc_stderr,none": 0.017362473762146627}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.04172343038705383}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2897959183673469, "acc_stderr,none": 0.029043088683304318}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.27860696517412936, "acc_stderr,none": 0.031700561834973086}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_stem": {"acc,none": 0.25150650174437045, "acc_stderr,none": 0.007707686524353222, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.03749850709174023}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19736842105263158, "acc_stderr,none": 0.03238981601699397}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2847222222222222, "acc_stderr,none": 0.03773809990686934}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.38, "acc_stderr,none": 0.048783173121456316}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.04488482852329017}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.25957446808510637, "acc_stderr,none": 0.028659179374292333}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2206896551724138, "acc_stderr,none": 0.034559302019248145}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24338624338624337, "acc_stderr,none": 0.02210112878741544}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.27419354838709675, "acc_stderr,none": 0.025378139970885193}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.16748768472906403, "acc_stderr,none": 0.026273086047535428}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.16, "acc_stderr,none": 0.03684529491774709}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.02659393910184407}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367746}, "mmlu_high_school_statistics": 
{"alias": " - high_school_statistics", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.03114144782353604}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.09017619680851063, "exact_match_stderr,custom-extract": 0.0025961652487086102, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.10320781032078104, "exact_match_stderr,custom-extract": 0.011369612924724331}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11533586818757921, "exact_match_stderr,custom-extract": 0.01137910999531721}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06448763250883392, "exact_match_stderr,custom-extract": 0.007303510883881907}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.07804878048780488, "exact_match_stderr,custom-extract": 0.013264026422963541}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.12085308056872038, "exact_match_stderr,custom-extract": 0.011226536807498511}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.06811145510835913, "exact_match_stderr,custom-extract": 0.008097563963307493}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1393643031784841, "exact_match_stderr,custom-extract": 0.012116422904979631}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11286089238845144, "exact_match_stderr,custom-extract": 0.016232140903461426}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.03451407811080836, "exact_match_stderr,custom-extract": 0.005503953663501524}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.053293856402664694, "exact_match_stderr,custom-extract": 0.00611335037392484}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10822510822510822, "exact_match_stderr,custom-extract": 0.010225646711914635}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1282565130260521, "exact_match_stderr,custom-extract": 0.014983711363001542}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08545034642032333, "exact_match_stderr,custom-extract": 0.00775931195209555}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.13157894736842105, "exact_match_stderr,custom-extract": 0.0119737232159004}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.214, "acc_stderr,none": 0.018359797502387, "acc_norm,none": 0.312, "acc_norm_stderr,none": 0.020740596536488076}, "piqa": {"alias": "piqa", "acc,none": 0.6425462459194777, "acc_stderr,none": 0.011181692590867645, "acc_norm,none": 0.6550598476605005, "acc_norm_stderr,none": 0.01109067010299315}, "race": {"alias": "race", "acc,none": 0.32344497607655504, "acc_stderr,none": 0.01447776480941772}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.39815762538382804, "acc_stderr,none": 0.011076888352430779}, "winogrande": {"alias": "winogrande", "acc,none": 0.5572217837411207, "acc_stderr,none": 0.013960157350784975}} {"created_at": "2025-04-20T06:49:14.611405", "global_step": 152000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3054607508532423, "acc_stderr,none": 0.013460080478002505, "acc_norm,none": 0.3677474402730375, "acc_norm_stderr,none": 0.014090995618168484}, "arc_easy": {"alias": "arc_easy", 
"acc,none": 0.6279461279461279, "acc_stderr,none": 0.009918187193096476, "acc_norm,none": 0.6069023569023569, "acc_norm_stderr,none": 0.010022540618945312}, "boolq": {"alias": "boolq", "acc,none": 0.5807339449541284, "acc_stderr,none": 0.008630302070999102}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19492219492219492, "acc_stderr,none": 0.011341478090883527}, "copa": {"alias": "copa", "acc,none": 0.68, "acc_stderr,none": 0.04688261722621504}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38966341366261703, "acc_stderr,none": 0.004866772373029923, "acc_norm,none": 0.5008962358095996, "acc_norm_stderr,none": 0.004989773395468894}, "mmlu": {"acc,none": 0.2503916820965674, "acc_stderr,none": 0.003653376835425342, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2607863974495218, "acc_stderr,none": 0.006400972067893343, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1746031746031746, "acc_stderr,none": 0.033954900208561116}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.0340150671524904}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.029331162294251735}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.28270042194092826, "acc_stderr,none": 0.029312814153955924}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.3305785123966942, "acc_stderr,none": 0.04294340845212094}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.042365112580946336}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26993865030674846, "acc_stderr,none": 0.03487825168497892}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.28034682080924855, "acc_stderr,none": 0.024182427496577615}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2604501607717042, "acc_stderr,none": 0.024926723224845543}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2808641975308642, "acc_stderr,none": 0.025006469755799204}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26597131681877445, "acc_stderr,none": 0.01128503316555128}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23391812865497075, "acc_stderr,none": 0.03246721765117826}, "mmlu_other": {"acc,none": 0.2565175410363695, "acc_stderr,none": 0.007829669629092057, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22641509433962265, "acc_stderr,none": 0.025757559893106723}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2254335260115607, "acc_stderr,none": 0.03186209851641143}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.38, "acc_stderr,none": 0.048783173121456316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.26905829596412556, "acc_stderr,none": 0.029763779406874972}, "mmlu_management": {"alias": " - management", "acc,none": 0.2621359223300971, "acc_stderr,none": 0.043546310772605956}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.24358974358974358, "acc_stderr,none": 
0.0281209665039144}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2720306513409962, "acc_stderr,none": 0.015913367447500517}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.025553169991826514}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.23049645390070922, "acc_stderr,none": 0.025123739226872402}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20955882352941177, "acc_stderr,none": 0.02472311040767705}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.26506024096385544, "acc_stderr,none": 0.03436024037944967}, "mmlu_social_sciences": {"acc,none": 0.2346441338966526, "acc_stderr,none": 0.007633518524806764, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25252525252525254, "acc_stderr,none": 0.030954055470365907}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.02869787397186068}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.19230769230769232, "acc_stderr,none": 0.0199823472086373}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23949579831932774, "acc_stderr,none": 0.027722065493361276}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22385321100917432, "acc_stderr,none": 0.017871217767790215}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.037683359597287434}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.01795244919698787}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.04350271442923243}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20408163265306123, "acc_stderr,none": 0.025801283475090503}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.27860696517412936, "acc_stderr,none": 0.031700561834973086}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036625}, "mmlu_stem": {"acc,none": 0.24421186171899778, "acc_stderr,none": 0.007646175024501159, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.34074074074074073, "acc_stderr,none": 0.04094376269996793}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.034597776068105365}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.03586879280080341}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653695}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165044}, 
"mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.04220773659171452}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.251063829787234, "acc_stderr,none": 0.02834696377716246}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131184}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25132275132275134, "acc_stderr,none": 0.0223404823396439}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.21935483870967742, "acc_stderr,none": 0.023540799358723302}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.28078817733990147, "acc_stderr,none": 0.0316185633535861}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.22592592592592592, "acc_stderr,none": 0.025497532639609542}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2582781456953642, "acc_stderr,none": 0.035737053147634576}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1712962962962963, "acc_stderr,none": 0.025695341643824688}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "mmlu_pro": {"exact_match,custom-extract": 0.08868018617021277, "exact_match_stderr,custom-extract": 0.0025831127609577285, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.08647140864714087, "exact_match_stderr,custom-extract": 0.010503664174021146}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10519645120405577, "exact_match_stderr,custom-extract": 0.010929524923270317}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.046819787985865724, "exact_match_stderr,custom-extract": 0.006281609400208294}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.0951219512195122, "exact_match_stderr,custom-extract": 0.014506870947377837}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1018957345971564, "exact_match_stderr,custom-extract": 0.010419037340753811}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.0825593395252838, "exact_match_stderr,custom-extract": 0.00884574505399994}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.11858190709046455, "exact_match_stderr,custom-extract": 0.011310691771467603}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.09448818897637795, "exact_match_stderr,custom-extract": 0.0150052772401423}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.055404178019981834, "exact_match_stderr,custom-extract": 0.00689759873090977}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0695780903034789, "exact_match_stderr,custom-extract": 0.006924833446490216}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12121212121212122, "exact_match_stderr,custom-extract": 0.010742718710399555}, "mmlu_pro_philosophy": {"alias": " - 
philosophy", "exact_match,custom-extract": 0.11623246492985972, "exact_match_stderr,custom-extract": 0.014362104240159221}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.0800615858352579, "exact_match_stderr,custom-extract": 0.007532759901424645}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12781954887218044, "exact_match_stderr,custom-extract": 0.011826947082307285}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.204, "acc_stderr,none": 0.018039369104138663, "acc_norm,none": 0.35, "acc_norm_stderr,none": 0.021352091786223104}, "piqa": {"alias": "piqa", "acc,none": 0.6643090315560392, "acc_stderr,none": 0.011017938116656313, "acc_norm,none": 0.675734494015234, "acc_norm_stderr,none": 0.010921539041347964}, "race": {"alias": "race", "acc,none": 0.33875598086124403, "acc_stderr,none": 0.014647857789710098}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.41146366427840325, "acc_stderr,none": 0.0111352831165402}, "winogrande": {"alias": "winogrande", "acc,none": 0.5872138910812944, "acc_stderr,none": 0.013837060648682089}} {"created_at": "2025-04-20T08:42:32.673650", "global_step": 154000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.28924914675767915, "acc_stderr,none": 0.013250012579393443, "acc_norm,none": 0.34044368600682595, "acc_norm_stderr,none": 0.01384746051889298}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6191077441077442, "acc_stderr,none": 0.009964428212260372, "acc_norm,none": 0.6077441077441077, "acc_norm_stderr,none": 0.010018744689650043}, "boolq": {"alias": "boolq", "acc,none": 0.39877675840978594, "acc_stderr,none": 0.008563973987729913}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20884520884520885, "acc_stderr,none": 0.011637590576063077}, "copa": {"alias": "copa", "acc,none": 0.64, "acc_stderr,none": 0.04824181513244218}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38418641704839673, "acc_stderr,none": 0.004854082479916912, "acc_norm,none": 0.5003983270264888, "acc_norm_stderr,none": 0.0049897798280438485}, "mmlu": {"acc,none": 0.24939467312348668, "acc_stderr,none": 0.003650464532534411, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2501594048884166, "acc_stderr,none": 0.006314086007892061, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.037184890068181146}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.03346409881055953}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.030587591351604257}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2869198312236287, "acc_stderr,none": 0.02944377302259469}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2892561983471074, "acc_stderr,none": 0.04139112727635464}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.04133119440243839}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615768}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2514450867052023, "acc_stderr,none": 0.023357365785874037}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " 
- philosophy", "acc,none": 0.18971061093247588, "acc_stderr,none": 0.02226819625878321}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02438366553103545}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25945241199478486, "acc_stderr,none": 0.011195262076350309}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.29239766081871343, "acc_stderr,none": 0.03488647713457921}, "mmlu_other": {"acc,none": 0.2642420341165111, "acc_stderr,none": 0.007901599867760623, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.27169811320754716, "acc_stderr,none": 0.027377706624670713}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0326926380614177}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3542600896860987, "acc_stderr,none": 0.03210062154134987}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.040580420156460344}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.02934311479809446}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2503192848020434, "acc_stderr,none": 0.01549108895149458}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.0242886194660461}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.26595744680851063, "acc_stderr,none": 0.026358065698880592}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.22058823529411764, "acc_stderr,none": 0.025187786660227262}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.30120481927710846, "acc_stderr,none": 0.0357160923005348}, "mmlu_social_sciences": {"acc,none": 0.2430939226519337, "acc_stderr,none": 0.007741525743513206, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748141}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.0298575156733864}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22279792746113988, "acc_stderr,none": 0.03003114797764154}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.24358974358974358, "acc_stderr,none": 0.02176373368417393}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2689075630252101, "acc_stderr,none": 0.028801392193631273}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23119266055045873, "acc_stderr,none": 0.018075750241633156}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306086}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.01728276069516741}, "mmlu_public_relations": {"alias": " - public_relations", 
"acc,none": 0.2818181818181818, "acc_stderr,none": 0.043091187099464585}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22040816326530613, "acc_stderr,none": 0.02653704531214531}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.27860696517412936, "acc_stderr,none": 0.031700561834973086}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768081}, "mmlu_stem": {"acc,none": 0.23977164605137963, "acc_stderr,none": 0.007604186275453181, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.03785714465066652}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.23026315789473684, "acc_stderr,none": 0.03426059424403165}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237654}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342347}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.036001056927277716}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.022019080012217897}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1967741935483871, "acc_stderr,none": 0.022616409420742004}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.19704433497536947, "acc_stderr,none": 0.027986724666736212}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2814814814814815, "acc_stderr,none": 0.027420019350945284}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.24503311258278146, "acc_stderr,none": 0.035118075718047245}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.22685185185185186, "acc_stderr,none": 0.028561650102422263}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3392857142857143, "acc_stderr,none": 0.04493949068613539}, "mmlu_pro": {"exact_match,custom-extract": 0.0984873670212766, "exact_match_stderr,custom-extract": 0.0027053234183410427, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.11297071129707113, "exact_match_stderr,custom-extract": 0.011830290263473046}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.08871989860583017, "exact_match_stderr,custom-extract": 0.01012915817962784}, "mmlu_pro_chemistry": {"alias": " - chemistry", 
"exact_match,custom-extract": 0.05742049469964664, "exact_match_stderr,custom-extract": 0.006917690995369924}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1048780487804878, "exact_match_stderr,custom-extract": 0.015150318019731044}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1481042654028436, "exact_match_stderr,custom-extract": 0.012233851872561222}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07636738906088751, "exact_match_stderr,custom-extract": 0.008536226336689337}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12469437652811736, "exact_match_stderr,custom-extract": 0.011558254824072245}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.14960629921259844, "exact_match_stderr,custom-extract": 0.018297559115940498}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07266121707538602, "exact_match_stderr,custom-extract": 0.007826619182374834}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06365655070318282, "exact_match_stderr,custom-extract": 0.0066446522220815}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12445887445887446, "exact_match_stderr,custom-extract": 0.010865516089885905}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1342685370741483, "exact_match_stderr,custom-extract": 0.015277913884522445}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1031562740569669, "exact_match_stderr,custom-extract": 0.008442457140721601}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.10776942355889724, "exact_match_stderr,custom-extract": 0.010983915176434654}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.246, "acc_stderr,none": 0.019279819056352558, "acc_norm,none": 0.334, "acc_norm_stderr,none": 0.021113492347743727}, "piqa": {"alias": "piqa", "acc,none": 0.6692056583242655, "acc_stderr,none": 0.01097752058471444, "acc_norm,none": 0.6632208922742111, "acc_norm_stderr,none": 0.01102673892525118}, "race": {"alias": "race", "acc,none": 0.33875598086124403, "acc_stderr,none": 0.014647857789710093}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4140225179119754, "acc_stderr,none": 0.011145545345176117}, "winogrande": {"alias": "winogrande", "acc,none": 0.584846093133386, "acc_stderr,none": 0.013848684086658585}} {"created_at": "2025-04-20T10:24:48.755350", "global_step": 156000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.30802047781569963, "acc_stderr,none": 0.01349142951729204, "acc_norm,none": 0.3447098976109215, "acc_norm_stderr,none": 0.013888816286782112}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6258417508417509, "acc_stderr,none": 0.009929516948977625, "acc_norm,none": 0.6106902356902357, "acc_norm_stderr,none": 0.01000521278287814}, "boolq": {"alias": "boolq", "acc,none": 0.537920489296636, "acc_stderr,none": 0.00871986856715964}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19983619983619982, "acc_stderr,none": 0.011448447996728393}, "copa": {"alias": "copa", "acc,none": 0.71, "acc_stderr,none": 0.04560480215720684}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3857797251543517, "acc_stderr,none": 0.004857840934549159, "acc_norm,none": 0.49741087432782316, "acc_norm_stderr,none": 0.004989714512282403}, "mmlu": {"acc,none": 0.2555903717419171, "acc_stderr,none": 0.0036758534540346846, 
"alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.273538788522848, "acc_stderr,none": 0.0064940139630113285, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.18253968253968253, "acc_stderr,none": 0.03455071019102148}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.03546563019624335}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.030964517926923403}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.26582278481012656, "acc_stderr,none": 0.028756799629658335}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.371900826446281, "acc_stderr,none": 0.04412015806624504}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.04077494709252628}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3067484662576687, "acc_stderr,none": 0.036230899157241474}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.28901734104046245, "acc_stderr,none": 0.024405173935783238}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2990353697749196, "acc_stderr,none": 0.02600330111788513}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.30246913580246915, "acc_stderr,none": 0.02555765398186805}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2711864406779661, "acc_stderr,none": 0.011354581451622985}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.035087719298245626}, "mmlu_other": {"acc,none": 0.2417122626327647, "acc_stderr,none": 0.007667042623731389, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891363}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.03295304696818318}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2062780269058296, "acc_stderr,none": 0.02715715047956382}, "mmlu_management": {"alias": " - management", "acc,none": 0.20388349514563106, "acc_stderr,none": 0.03989139859531772}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.25213675213675213, "acc_stderr,none": 0.02844796547623101}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2681992337164751, "acc_stderr,none": 0.015842430835269438}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.0248480182638752}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2730496453900709, "acc_stderr,none": 0.026577860943307854}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.16911764705882354, "acc_stderr,none": 0.022770868010113025}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.20481927710843373, 
"acc_stderr,none": 0.03141784291663925}, "mmlu_social_sciences": {"acc,none": 0.23366915827104323, "acc_stderr,none": 0.007632551872387549, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.030532892233932036}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22797927461139897, "acc_stderr,none": 0.03027690994517826}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20512820512820512, "acc_stderr,none": 0.020473233173551986}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.026265024608275882}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22568807339449543, "acc_stderr,none": 0.017923087667803057}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.21374045801526717, "acc_stderr,none": 0.0359546161177469}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27124183006535946, "acc_stderr,none": 0.017986615304030326}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.20909090909090908, "acc_stderr,none": 0.038950910157241364}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24081632653061225, "acc_stderr,none": 0.027372942201788163}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401464}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_stem": {"acc,none": 0.2638756739613067, "acc_stderr,none": 0.007843677285278997, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04072314811876837}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3026315789473684, "acc_stderr,none": 0.037385206761196686}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566017}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653696}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2170212765957447, "acc_stderr,none": 0.026947483121496224}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.03806142687309993}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.02286083830923207}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 
0.24838709677419354, "acc_stderr,none": 0.024580028921481006}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2955665024630542, "acc_stderr,none": 0.032104944337514575}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.02646611753895991}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.271523178807947, "acc_stderr,none": 0.03631329803969653}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2175925925925926, "acc_stderr,none": 0.02813968944485967}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.22321428571428573, "acc_stderr,none": 0.039523019677025116}, "mmlu_pro": {"exact_match,custom-extract": 0.10147938829787234, "exact_match_stderr,custom-extract": 0.0027467225534453525, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.1199442119944212, "exact_match_stderr,custom-extract": 0.012141944360521243}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10899873257287707, "exact_match_stderr,custom-extract": 0.011101630697795397}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.08303886925795052, "exact_match_stderr,custom-extract": 0.008205118814321247}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.08536585365853659, "exact_match_stderr,custom-extract": 0.013816694190586966}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.14218009478672985, "exact_match_stderr,custom-extract": 0.012028283958485726}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08668730650154799, "exact_match_stderr,custom-extract": 0.00904377653422991}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12224938875305623, "exact_match_stderr,custom-extract": 0.011460350236485622}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11286089238845144, "exact_match_stderr,custom-extract": 0.01623214090346142}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07356948228882834, "exact_match_stderr,custom-extract": 0.00787152599070514}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06513693560325684, "exact_match_stderr,custom-extract": 0.0067161560447467315}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12012987012987013, "exact_match_stderr,custom-extract": 0.010701235964726306}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11022044088176353, "exact_match_stderr,custom-extract": 0.014033229017364514}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10931485758275597, "exact_match_stderr,custom-extract": 0.008660926526693489}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12030075187969924, "exact_match_stderr,custom-extract": 0.01152317401995648}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.216, "acc_stderr,none": 0.018421909061411935, "acc_norm,none": 0.348, "acc_norm_stderr,none": 0.0213237286328075}, "piqa": {"alias": "piqa", "acc,none": 0.6599564744287268, "acc_stderr,none": 0.011052749414423546, "acc_norm,none": 0.6610446137105549, 
"acc_norm_stderr,none": 0.011044144419710645}, "race": {"alias": "race", "acc,none": 0.3464114832535885, "acc_stderr,none": 0.014726451021782805}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4124872057318321, "acc_stderr,none": 0.01113942525917073}, "winogrande": {"alias": "winogrande", "acc,none": 0.6037884767166535, "acc_stderr,none": 0.013746404157154949}} {"created_at": "2025-04-20T12:22:25.442548", "global_step": 158000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2781569965870307, "acc_stderr,none": 0.013094469919538804, "acc_norm,none": 0.3378839590443686, "acc_norm_stderr,none": 0.013822047922283517}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5883838383838383, "acc_stderr,none": 0.010098218646714908, "acc_norm,none": 0.5517676767676768, "acc_norm_stderr,none": 0.010204645126856933}, "boolq": {"alias": "boolq", "acc,none": 0.6293577981651376, "acc_stderr,none": 0.008447316806409937}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2072072072072072, "acc_stderr,none": 0.011603856781422558}, "copa": {"alias": "copa", "acc,none": 0.7, "acc_stderr,none": 0.046056618647183814}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3833897629954192, "acc_stderr,none": 0.00485218262127426, "acc_norm,none": 0.4971121290579566, "acc_norm_stderr,none": 0.004989698183207835}, "mmlu": {"acc,none": 0.241917105825381, "acc_stderr,none": 0.0036102770656897595, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2461211477151966, "acc_stderr,none": 0.006273734662273917, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04216370213557835}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23030303030303031, "acc_stderr,none": 0.0328766675860349}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.28270042194092826, "acc_stderr,none": 0.029312814153955924}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.03984979653302871}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.03226219377286774}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0230836585869842}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19292604501607716, "acc_stderr,none": 0.022411516780911363}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21604938271604937, "acc_stderr,none": 0.022899162918445806}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24837027379400262, "acc_stderr,none": 0.01103521259803449}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.036155076303109344}, "mmlu_other": {"acc,none": 0.25555197940135177, "acc_stderr,none": 0.00781099567319345, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.04605661864718381}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", 
"acc,none": 0.22641509433962265, "acc_stderr,none": 0.025757559893106748}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.03095289021774988}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036625}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3183856502242152, "acc_stderr,none": 0.03126580522513713}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.039166677628225836}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2515964240102171, "acc_stderr,none": 0.015517322365529638}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.28104575163398693, "acc_stderr,none": 0.02573885479781873}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.23049645390070922, "acc_stderr,none": 0.025123739226872405}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.22426470588235295, "acc_stderr,none": 0.02533684856333237}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2710843373493976, "acc_stderr,none": 0.03460579907553026}, "mmlu_social_sciences": {"acc,none": 0.23399415014624633, "acc_stderr,none": 0.0076288104921593035, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.20175438596491227, "acc_stderr,none": 0.037752050135836386}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.1919191919191919, "acc_stderr,none": 0.02805779167298901}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21761658031088082, "acc_stderr,none": 0.029778663037752954}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.23846153846153847, "acc_stderr,none": 0.02160629449464773}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2605042016806723, "acc_stderr,none": 0.028510251512341933}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22018348623853212, "acc_stderr,none": 0.01776597865232757}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.31297709923664124, "acc_stderr,none": 0.04066962905677697}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2434640522875817, "acc_stderr,none": 0.017362473762146634}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19591836734693877, "acc_stderr,none": 0.025409301953225678}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_stem": {"acc,none": 0.22993973993022518, "acc_stderr,none": 0.007496533941492668, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.1925925925925926, "acc_stderr,none": 
0.03406542058502652}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.21710526315789475, "acc_stderr,none": 0.033550453048829226}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.037455547914624555}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.044405219061793254}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.02880998985410297}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.02113285918275444}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.19032258064516128, "acc_stderr,none": 0.022331707611823078}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.18719211822660098, "acc_stderr,none": 0.027444924966882618}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.024882116857655106}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2251655629139073, "acc_stderr,none": 0.03410435282008936}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.25, "acc_stderr,none": 0.029531221160930918}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755806}, "mmlu_pro": {"exact_match,custom-extract": 0.10206117021276596, "exact_match_stderr,custom-extract": 0.002750118346179492, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.11157601115760112, "exact_match_stderr,custom-extract": 0.011766276311081417}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09252217997465145, "exact_match_stderr,custom-extract": 0.010322332141863073}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06625441696113074, "exact_match_stderr,custom-extract": 0.007395889196467735}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.08292682926829269, "exact_match_stderr,custom-extract": 0.013636027558244166}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.14928909952606634, "exact_match_stderr,custom-extract": 0.012274145317879097}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.0846233230134159, "exact_match_stderr,custom-extract": 0.008945554797547937}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.11124694376528117, "exact_match_stderr,custom-extract": 0.011000782283753606}, 
"mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.13385826771653545, "exact_match_stderr,custom-extract": 0.017467280079326592}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07447774750227067, "exact_match_stderr,custom-extract": 0.007916083319262582}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0695780903034789, "exact_match_stderr,custom-extract": 0.006924833446490191}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11904761904761904, "exact_match_stderr,custom-extract": 0.010659472740112141}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.14829659318637275, "exact_match_stderr,custom-extract": 0.0159255744939775}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.11393379522709776, "exact_match_stderr,custom-extract": 0.008819054413543095}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.13533834586466165, "exact_match_stderr,custom-extract": 0.01211725844922514}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.202, "acc_stderr,none": 0.017973260031288265, "acc_norm,none": 0.338, "acc_norm_stderr,none": 0.02117566569520941}, "piqa": {"alias": "piqa", "acc,none": 0.6501632208922742, "acc_stderr,none": 0.011127288644632837, "acc_norm,none": 0.6572361262241567, "acc_norm_stderr,none": 0.011073978007039312}, "race": {"alias": "race", "acc,none": 0.3349282296650718, "acc_stderr,none": 0.014606961503556257}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.40583418628454454, "acc_stderr,none": 0.011111610832965833}, "winogrande": {"alias": "winogrande", "acc,none": 0.5611681136543015, "acc_stderr,none": 0.013946933444507034}} {"created_at": "2025-04-20T14:08:15.733990", "global_step": 160000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2781569965870307, "acc_stderr,none": 0.013094469919538804, "acc_norm,none": 0.32849829351535836, "acc_norm_stderr,none": 0.01372497846553736}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5496632996632996, "acc_stderr,none": 0.010209047724374152, "acc_norm,none": 0.5248316498316499, "acc_norm_stderr,none": 0.01024712312215928}, "boolq": {"alias": "boolq", "acc,none": 0.5590214067278287, "acc_stderr,none": 0.008683913982298869}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19737919737919737, "acc_stderr,none": 0.011395305685091195}, "copa": {"alias": "copa", "acc,none": 0.63, "acc_stderr,none": 0.048523658709391}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.37532364070902213, "acc_stderr,none": 0.004832167854501649, "acc_norm,none": 0.48685520812587135, "acc_norm_stderr,none": 0.004988056789119667}, "mmlu": {"acc,none": 0.23451075345392394, "acc_stderr,none": 0.003570218509176988, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24760892667375134, "acc_stderr,none": 0.0062888437386881485, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.03932537680392869}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.033175059300091805}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.03114557065948678}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.31223628691983124, "acc_stderr,none": 0.030165137867847008}, "mmlu_international_law": {"alias": " - 
international_law", "acc,none": 0.21487603305785125, "acc_stderr,none": 0.03749492448709698}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.041331194402438376}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.1901840490797546, "acc_stderr,none": 0.030833491146281245}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.02361867831006937}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2090032154340836, "acc_stderr,none": 0.023093140398374224}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023132376234543336}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2522816166883963, "acc_stderr,none": 0.011092789056875238}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.03565079670708311}, "mmlu_other": {"acc,none": 0.23978113936272932, "acc_stderr,none": 0.00764054308478507, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891366}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.19653179190751446, "acc_stderr,none": 0.030299574664788147}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.15, "acc_stderr,none": 0.03588702812826371}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3273542600896861, "acc_stderr,none": 0.03149384670994131}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2863247863247863, "acc_stderr,none": 0.02961432369045665}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23371647509578544, "acc_stderr,none": 0.015133383278988822}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.238562091503268, "acc_stderr,none": 0.024404394928087866}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2375886524822695, "acc_stderr,none": 0.025389512552729903}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19852941176470587, "acc_stderr,none": 0.024231013370541083}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.27710843373493976, "acc_stderr,none": 0.03484331592680588}, "mmlu_social_sciences": {"acc,none": 0.21644458888527787, "acc_stderr,none": 0.007424782267857647, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.19298245614035087, "acc_stderr,none": 0.037124548537213684}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.18181818181818182, "acc_stderr,none": 0.027479603010538787}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20207253886010362, "acc_stderr,none": 0.02897908979429673}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.21025641025641026, "acc_stderr,none": 
0.02066059748502693}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23949579831932774, "acc_stderr,none": 0.027722065493361273}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1871559633027523, "acc_stderr,none": 0.016722684526200154}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.03641297081313729}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24673202614379086, "acc_stderr,none": 0.017440820367402507}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.040693063197213754}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2, "acc_stderr,none": 0.025607375986579157}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22885572139303484, "acc_stderr,none": 0.029705284056772443}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_stem": {"acc,none": 0.2274024738344434, "acc_stderr,none": 0.007453266065297652, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.03455473702325436}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19078947368421054, "acc_stderr,none": 0.031975658210325}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.03586879280080341}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.029241883869628817}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.02084229093011467}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1774193548387097, "acc_stderr,none": 0.02173254068932927}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.18719211822660098, "acc_stderr,none": 0.027444924966882618}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23333333333333334, "acc_stderr,none": 0.025787874220959323}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2052980132450331, "acc_stderr,none": 0.03297986648473836}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.02792096314799366}, 
"mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3482142857142857, "acc_stderr,none": 0.04521829902833585}, "mmlu_pro": {"exact_match,custom-extract": 0.10987367021276596, "exact_match_stderr,custom-extract": 0.0028451808155081663, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.1199442119944212, "exact_match_stderr,custom-extract": 0.012141944360521245}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10266159695817491, "exact_match_stderr,custom-extract": 0.010812323380686585}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07950530035335689, "exact_match_stderr,custom-extract": 0.008044098592471977}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1048780487804878, "exact_match_stderr,custom-extract": 0.015150318019731044}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.15165876777251186, "exact_match_stderr,custom-extract": 0.012353933579536028}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09803921568627451, "exact_match_stderr,custom-extract": 0.009557758729735868}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.11491442542787286, "exact_match_stderr,custom-extract": 0.011157550931380883}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11023622047244094, "exact_match_stderr,custom-extract": 0.01606599843477817}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09173478655767484, "exact_match_stderr,custom-extract": 0.008703161154686665}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0843819393042191, "exact_match_stderr,custom-extract": 0.0075651064286399875}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.13203463203463203, "exact_match_stderr,custom-extract": 0.011142798517705907}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1342685370741483, "exact_match_stderr,custom-extract": 0.015277913884522445}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10623556581986143, "exact_match_stderr,custom-extract": 0.008552816527360382}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.15162907268170425, "exact_match_stderr,custom-extract": 0.01270442364591536}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.198, "acc_stderr,none": 0.017838958963847237, "acc_norm,none": 0.33, "acc_norm_stderr,none": 0.021049612166134806}, "piqa": {"alias": "piqa", "acc,none": 0.6311207834602829, "acc_stderr,none": 0.011257546676908809, "acc_norm,none": 0.6327529923830251, "acc_norm_stderr,none": 0.011247128539690558}, "race": {"alias": "race", "acc,none": 0.3311004784688995, "acc_stderr,none": 0.014564986871061022}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.38024564994882293, "acc_stderr,none": 0.010984765684099734}, "winogrande": {"alias": "winogrande", "acc,none": 0.5256511444356748, "acc_stderr,none": 0.014033980956108557}} {"created_at": "2025-04-20T15:58:12.610021", "global_step": 162000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.29266211604095566, "acc_stderr,none": 0.013295916103619423, "acc_norm,none": 0.3293515358361775, "acc_norm_stderr,none": 0.013734057652635474}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6182659932659933, "acc_stderr,none": 0.00996864885183967, "acc_norm,none": 
0.5845959595959596, "acc_norm_stderr,none": 0.010111869494911524}, "boolq": {"alias": "boolq", "acc,none": 0.6278287461773701, "acc_stderr,none": 0.008454434247373901}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.1981981981981982, "acc_stderr,none": 0.011413095456219316}, "copa": {"alias": "copa", "acc,none": 0.72, "acc_stderr,none": 0.045126085985421276}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.388070105556662, "acc_stderr,none": 0.004863147544177514, "acc_norm,none": 0.5049790878311093, "acc_norm_stderr,none": 0.004989533998820356}, "mmlu": {"acc,none": 0.23095000712149266, "acc_stderr,none": 0.0035507166839477563, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2452709883103082, "acc_stderr,none": 0.006268214176779303, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.0404061017820884}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.033175059300091805}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.031145570659486782}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2085889570552147, "acc_stderr,none": 0.03192193448934722}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2398843930635838, "acc_stderr,none": 0.022989592543123567}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2581005586592179, "acc_stderr,none": 0.014635185616527836}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.17684887459807075, "acc_stderr,none": 0.02167005888551079}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.20987654320987653, "acc_stderr,none": 0.02265834408598137}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2470664928292047, "acc_stderr,none": 0.011015752255279333}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.03565079670708312}, "mmlu_other": {"acc,none": 0.24074670099774703, "acc_stderr,none": 0.007644575961560687, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.20754716981132076, "acc_stderr,none": 0.02495991802891127}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.030952890217749884}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.15, "acc_stderr,none": 0.03588702812826371}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.18446601941747573, "acc_stderr,none": 0.03840423627288276}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.02987257770889117}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.33, 
"acc_stderr,none": 0.04725815626252605}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23627075351213284, "acc_stderr,none": 0.015190473717037493}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.0239291555173513}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24113475177304963, "acc_stderr,none": 0.025518731049537762}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.18382352941176472, "acc_stderr,none": 0.023529242185193106}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2891566265060241, "acc_stderr,none": 0.03529486801511115}, "mmlu_social_sciences": {"acc,none": 0.2203444913877153, "acc_stderr,none": 0.007473543200293327, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748141}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.18181818181818182, "acc_stderr,none": 0.027479603010538804}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2153846153846154, "acc_stderr,none": 0.020843034557462874}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.2018348623853211, "acc_stderr,none": 0.017208579357787565}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.03768335959728743}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2434640522875817, "acc_stderr,none": 0.01736247376214663}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.04172343038705383}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19183673469387755, "acc_stderr,none": 0.025206963154225423}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.03014777593540922}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.21027592768791628, "acc_stderr,none": 0.007253742127923283, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.0335567721631314}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.20394736842105263, "acc_stderr,none": 0.0327900040631005}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566017}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 
0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.24680851063829787, "acc_stderr,none": 0.028185441301234102}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.19576719576719576, "acc_stderr,none": 0.020435730971541798}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1774193548387097, "acc_stderr,none": 0.02173254068932927}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1477832512315271, "acc_stderr,none": 0.024969621333521274}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.02504044387700069}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.19205298013245034, "acc_stderr,none": 0.032162984205936156}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18055555555555555, "acc_stderr,none": 0.02623287897149166}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.22321428571428573, "acc_stderr,none": 0.039523019677025116}, "mmlu_pro": {"exact_match,custom-extract": 0.08244680851063829, "exact_match_stderr,custom-extract": 0.0024991004261953227, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.07531380753138076, "exact_match_stderr,custom-extract": 0.009862294734961778}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10392902408111533, "exact_match_stderr,custom-extract": 0.010871175856870044}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.037102473498233215, "exact_match_stderr,custom-extract": 0.005620308630822619}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.06341463414634146, "exact_match_stderr,custom-extract": 0.01205054740332862}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.10781990521327015, "exact_match_stderr,custom-extract": 0.010682230633557251}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09184726522187822, "exact_match_stderr,custom-extract": 0.009282712153833542}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12102689486552567, "exact_match_stderr,custom-extract": 0.011410842488489002}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10498687664041995, "exact_match_stderr,custom-extract": 0.015724991203554556}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.06267029972752043, "exact_match_stderr,custom-extract": 0.0073077003757296135}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06513693560325684, "exact_match_stderr,custom-extract": 0.00671615604474674}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.09848484848484848, "exact_match_stderr,custom-extract": 0.009807772312247507}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.09418837675350701, 
"exact_match_stderr,custom-extract": 0.013088893360161025}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.06543494996150885, "exact_match_stderr,custom-extract": 0.006863921514375278}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.11152882205513784, "exact_match_stderr,custom-extract": 0.011150287588558724}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.236, "acc_stderr,none": 0.019008699622084728, "acc_norm,none": 0.364, "acc_norm_stderr,none": 0.021539170637317688}, "piqa": {"alias": "piqa", "acc,none": 0.6670293797606094, "acc_stderr,none": 0.010995648822619077, "acc_norm,none": 0.6664853101196954, "acc_norm_stderr,none": 0.011000139592184575}, "race": {"alias": "race", "acc,none": 0.35119617224880384, "acc_stderr,none": 0.014773430019036974}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4083930399181167, "acc_stderr,none": 0.011122558066098066}, "winogrande": {"alias": "winogrande", "acc,none": 0.5880031570639306, "acc_stderr,none": 0.01383311285764593}} {"created_at": "2025-04-20T17:51:50.048318", "global_step": 164000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.31569965870307165, "acc_stderr,none": 0.01358257109581529, "acc_norm,none": 0.3412969283276451, "acc_norm_stderr,none": 0.013855831287497724}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6502525252525253, "acc_stderr,none": 0.009785578618940732, "acc_norm,none": 0.6174242424242424, "acc_norm_stderr,none": 0.009972837790531477}, "boolq": {"alias": "boolq", "acc,none": 0.6042813455657492, "acc_stderr,none": 0.008552742471459797}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19328419328419327, "acc_stderr,none": 0.011305207486827706}, "copa": {"alias": "copa", "acc,none": 0.68, "acc_stderr,none": 0.046882617226215034}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3820952001593308, "acc_stderr,none": 0.004849065962692141, "acc_norm,none": 0.5, "acc_norm_stderr,none": 0.004989781411445852}, "mmlu": {"acc,none": 0.23557897735365332, "acc_stderr,none": 0.0035765468572017613, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2454835281615303, "acc_stderr,none": 0.006274156181240141, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.038095238095238126}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.0340150671524904}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.03019028245350194}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.28270042194092826, "acc_stderr,none": 0.02931281415395592}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2231404958677686, "acc_stderr,none": 0.03800754475228733}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.043300437496507437}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.02344582627654555}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19292604501607716, "acc_stderr,none": 
0.022411516780911363}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21604938271604937, "acc_stderr,none": 0.022899162918445813}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25488917861799215, "acc_stderr,none": 0.01113050981266297}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.03508771929824565}, "mmlu_other": {"acc,none": 0.24750563244287094, "acc_stderr,none": 0.007724665575041185, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22264150943396227, "acc_stderr,none": 0.025604233470899098}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.21965317919075145, "acc_stderr,none": 0.031568093627031744}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.34080717488789236, "acc_stderr,none": 0.031811497470553604}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.29914529914529914, "acc_stderr,none": 0.029996951858349476}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23627075351213284, "acc_stderr,none": 0.0151904737170375}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23202614379084968, "acc_stderr,none": 0.02417084087934102}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.026011992930902013}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19117647058823528, "acc_stderr,none": 0.02388688192244033}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25301204819277107, "acc_stderr,none": 0.03384429155233134}, "mmlu_social_sciences": {"acc,none": 0.2252193695157621, "acc_stderr,none": 0.007525313096552954, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.040493392977481425}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.20202020202020202, "acc_stderr,none": 0.028606204289229876}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20207253886010362, "acc_stderr,none": 0.02897908979429673}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20512820512820512, "acc_stderr,none": 0.020473233173551975}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2184873949579832, "acc_stderr,none": 0.026841514322958945}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.2, "acc_stderr,none": 0.017149858514250934}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2900763358778626, "acc_stderr,none": 0.03980066246467766}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24836601307189543, "acc_stderr,none": 0.017479487001364764}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.04122066502878284}, 
"mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19591836734693877, "acc_stderr,none": 0.025409301953225678}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.03014777593540922}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_stem": {"acc,none": 0.21915635902315256, "acc_stderr,none": 0.007360050426070162, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.0335567721631314}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.03317672787533158}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653695}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768081}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036845}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2723404255319149, "acc_stderr,none": 0.029101290698386698}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.20899470899470898, "acc_stderr,none": 0.02094048156533485}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2, "acc_stderr,none": 0.022755204959542936}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1921182266009852, "acc_stderr,none": 0.027719315709614785}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.22592592592592592, "acc_stderr,none": 0.025497532639609553}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.03257847384436777}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1527777777777778, "acc_stderr,none": 0.024536326026134217}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.0959940159574468, "exact_match_stderr,custom-extract": 0.002675007560077223, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.11297071129707113, "exact_match_stderr,custom-extract": 0.011830290263473058}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10012674271229405, "exact_match_stderr,custom-extract": 0.01069307487996213}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.044169611307420496, "exact_match_stderr,custom-extract": 
0.006109714311601106}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1024390243902439, "exact_match_stderr,custom-extract": 0.014993500684238487}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.14218009478672985, "exact_match_stderr,custom-extract": 0.012028283958485729}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.0846233230134159, "exact_match_stderr,custom-extract": 0.008945554797547963}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12469437652811736, "exact_match_stderr,custom-extract": 0.011558254824072243}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11811023622047244, "exact_match_stderr,custom-extract": 0.01655614119804242}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.06993642143505904, "exact_match_stderr,custom-extract": 0.00768974041363005}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07031828275351591, "exact_match_stderr,custom-extract": 0.00695880054927048}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11796536796536797, "exact_match_stderr,custom-extract": 0.010617425726799204}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11222444889779559, "exact_match_stderr,custom-extract": 0.014144273960803913}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08622016936104696, "exact_match_stderr,custom-extract": 0.0077909043682681525}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.13157894736842105, "exact_match_stderr,custom-extract": 0.011973723215900397}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.236, "acc_stderr,none": 0.019008699622084735, "acc_norm,none": 0.356, "acc_norm_stderr,none": 0.021434712356072645}, "piqa": {"alias": "piqa", "acc,none": 0.6556039173014145, "acc_stderr,none": 0.011086521237125621, "acc_norm,none": 0.6643090315560392, "acc_norm_stderr,none": 0.01101793811665632}, "race": {"alias": "race", "acc,none": 0.3244019138755981, "acc_stderr,none": 0.014488908168432263}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4048106448311157, "acc_stderr,none": 0.011107144401926746}, "winogrande": {"alias": "winogrande", "acc,none": 0.5785319652722968, "acc_stderr,none": 0.013878072377497603}} {"created_at": "2025-04-20T19:43:43.769390", "global_step": 166000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3046075085324232, "acc_stderr,none": 0.013449522109932487, "acc_norm,none": 0.35409556313993173, "acc_norm_stderr,none": 0.013975454122756557}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6212121212121212, "acc_stderr,none": 0.009953737656542042, "acc_norm,none": 0.5883838383838383, "acc_norm_stderr,none": 0.010098218646714908}, "boolq": {"alias": "boolq", "acc,none": 0.6168195718654435, "acc_stderr,none": 0.008503021391450784}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19737919737919737, "acc_stderr,none": 0.011395305685091192}, "copa": {"alias": "copa", "acc,none": 0.72, "acc_stderr,none": 0.04512608598542128}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38398725353515234, "acc_stderr,none": 0.004853608805843882, "acc_norm,none": 0.4949213304122685, "acc_norm_stderr,none": 0.0049895240030924356}, "mmlu": {"acc,none": 0.2605754166073209, "acc_stderr,none": 0.0036947639446499383, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2461211477151966, 
"acc_stderr,none": 0.0062810595776897704, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.03893259610604674}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.03374402644139405}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.031145570659486782}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2616033755274262, "acc_stderr,none": 0.028609516716994934}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.17355371900826447, "acc_stderr,none": 0.03457272836917671}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.04133119440243839}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26380368098159507, "acc_stderr,none": 0.03462419931615624}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23121387283236994, "acc_stderr,none": 0.022698657167855713}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.26145251396648045, "acc_stderr,none": 0.014696599650364548}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19292604501607716, "acc_stderr,none": 0.022411516780911366}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25, "acc_stderr,none": 0.02409347123262133}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24902216427640156, "acc_stderr,none": 0.01104489226404077}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23976608187134502, "acc_stderr,none": 0.03274485211946956}, "mmlu_other": {"acc,none": 0.24364338590280013, "acc_stderr,none": 0.007686626243483085, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2792452830188679, "acc_stderr,none": 0.027611163402399715}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.31213872832369943, "acc_stderr,none": 0.035331333893236574}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.17937219730941703, "acc_stderr,none": 0.02574981956919277}, "mmlu_management": {"alias": " - management", "acc,none": 0.2815533980582524, "acc_stderr,none": 0.04453254836326468}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.24786324786324787, "acc_stderr,none": 0.0282863240755644}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.20945083014048532, "acc_stderr,none": 0.014551310568143691}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.30718954248366015, "acc_stderr,none": 0.026415601914388995}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25177304964539005, "acc_stderr,none": 0.0258921511567094}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.25, "acc_stderr,none": 0.026303648393696036}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.23493975903614459, "acc_stderr,none": 0.03300533186128922}, "mmlu_social_sciences": {"acc,none": 
0.29054273643158923, "acc_stderr,none": 0.008176439728145614, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3181818181818182, "acc_stderr,none": 0.03318477333845331}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.31088082901554404, "acc_stderr,none": 0.03340361906276586}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.30512820512820515, "acc_stderr,none": 0.023346335293325884}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.35294117647058826, "acc_stderr,none": 0.031041941304059278}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.27889908256880735, "acc_stderr,none": 0.019227468876463517}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.037683359597287434}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2679738562091503, "acc_stderr,none": 0.017917974069594726}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.20909090909090908, "acc_stderr,none": 0.038950910157241364}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.34285714285714286, "acc_stderr,none": 0.030387262919547728}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2885572139303483, "acc_stderr,none": 0.03203841040213321}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_stem": {"acc,none": 0.2695845226768157, "acc_stderr,none": 0.007865887468403509, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.03749850709174023}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.27631578947368424, "acc_stderr,none": 0.03639057569952925}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.41, "acc_stderr,none": 0.049431107042371025}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3137254901960784, "acc_stderr,none": 0.04617034827006716}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.251063829787234, "acc_stderr,none": 0.028346963777162452}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776568}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2709677419354839, "acc_stderr,none": 0.025284416114900156}, 
"mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.22167487684729065, "acc_stderr,none": 0.029225575892489614}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036843}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.02659393910184407}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.03603038545360384}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.4212962962962963, "acc_stderr,none": 0.03367462138896078}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03894641120044792}, "mmlu_pro": {"exact_match,custom-extract": 0.10671542553191489, "exact_match_stderr,custom-extract": 0.0028047374000536108, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.13110181311018132, "exact_match_stderr,custom-extract": 0.012613403336459893}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10392902408111533, "exact_match_stderr,custom-extract": 0.010871175856870044}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0636042402826855, "exact_match_stderr,custom-extract": 0.007256738135342962}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1048780487804878, "exact_match_stderr,custom-extract": 0.015150318019731046}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.14928909952606634, "exact_match_stderr,custom-extract": 0.012274145317879102}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.0804953560371517, "exact_match_stderr,custom-extract": 0.008744292925925094}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1393643031784841, "exact_match_stderr,custom-extract": 0.01211642290497962}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.14435695538057744, "exact_match_stderr,custom-extract": 0.018029071904027544}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10717529518619437, "exact_match_stderr,custom-extract": 0.009326830860379755}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07475943745373798, "exact_match_stderr,custom-extract": 0.007158029108285764}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.13095238095238096, "exact_match_stderr,custom-extract": 0.01110395354203079}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13226452905811623, "exact_match_stderr,custom-extract": 0.015181011139551248}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08468052347959969, "exact_match_stderr,custom-extract": 0.007727531295642343}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.13032581453634084, "exact_match_stderr,custom-extract": 0.011925163793228655}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.218, "acc_stderr,none": 0.018483378223178866, "acc_norm,none": 0.342, "acc_norm_stderr,none": 0.021236147199899257}, "piqa": {"alias": "piqa", "acc,none": 0.6545157780195865, "acc_stderr,none": 0.011094802893617757, "acc_norm,none": 0.6692056583242655, "acc_norm_stderr,none": 0.010977520584714445}, "race": {"alias": 
"race", "acc,none": 0.32344497607655504, "acc_stderr,none": 0.01447776480941772}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4196519959058342, "acc_stderr,none": 0.01116703230339054}, "winogrande": {"alias": "winogrande", "acc,none": 0.580110497237569, "acc_stderr,none": 0.013870943986310391}} {"created_at": "2025-04-20T21:25:46.406509", "global_step": 168000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2977815699658703, "acc_stderr,none": 0.013363080107244487, "acc_norm,none": 0.32764505119453924, "acc_norm_stderr,none": 0.01371584794071934}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6191077441077442, "acc_stderr,none": 0.009964428212260384, "acc_norm,none": 0.6035353535353535, "acc_norm_stderr,none": 0.010037412763064522}, "boolq": {"alias": "boolq", "acc,none": 0.5070336391437309, "acc_stderr,none": 0.008744189661475108}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.1981981981981982, "acc_stderr,none": 0.011413095456219316}, "copa": {"alias": "copa", "acc,none": 0.73, "acc_stderr,none": 0.04461960433384741}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3866759609639514, "acc_stderr,none": 0.004859930926500306, "acc_norm,none": 0.5033857797251543, "acc_norm_stderr,none": 0.004989667009372652}, "mmlu": {"acc,none": 0.25964962256088875, "acc_stderr,none": 0.00369148653518781, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.26992561105207225, "acc_stderr,none": 0.006464300067453032, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03670066451047181}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.0340150671524904}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.029771775228145638}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2616033755274262, "acc_stderr,none": 0.028609516716994934}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.3305785123966942, "acc_stderr,none": 0.04294340845212095}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.37037037037037035, "acc_stderr,none": 0.04668408033024931}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.294478527607362, "acc_stderr,none": 0.03581165790474082}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2745664739884393, "acc_stderr,none": 0.02402774515526501}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24916201117318434, "acc_stderr,none": 0.01446589382985993}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3311897106109325, "acc_stderr,none": 0.02673062072800491}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.025171041915309684}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26988265971316816, "acc_stderr,none": 0.011337381084250423}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2046783625730994, "acc_stderr,none": 0.030944459778533197}, "mmlu_other": {"acc,none": 0.2661731573865465, "acc_stderr,none": 0.007891628060114187, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2528301886792453, "acc_stderr,none": 
0.026749899771241238}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173043}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939098}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.35874439461883406, "acc_stderr,none": 0.03219079200419996}, "mmlu_management": {"alias": " - management", "acc,none": 0.2912621359223301, "acc_stderr,none": 0.044986763205729224}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674057}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.016328814422102052}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.023929155517351294}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.026011992930902006}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.16911764705882354, "acc_stderr,none": 0.022770868010113004}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.27710843373493976, "acc_stderr,none": 0.034843315926805875}, "mmlu_social_sciences": {"acc,none": 0.24081897952551187, "acc_stderr,none": 0.007691222955466088, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.038351539543994194}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.30808080808080807, "acc_stderr,none": 0.03289477330098614}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22797927461139897, "acc_stderr,none": 0.030276909945178256}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2205128205128205, "acc_stderr,none": 0.021020672680827912}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.22268907563025211, "acc_stderr,none": 0.027025433498882374}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.2018348623853211, "acc_stderr,none": 0.017208579357787572}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.21374045801526717, "acc_stderr,none": 0.0359546161177469}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.29248366013071897, "acc_stderr,none": 0.018403415710109783}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3, "acc_stderr,none": 0.04389311454644286}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24081632653061225, "acc_stderr,none": 0.027372942201788163}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.19402985074626866, "acc_stderr,none": 0.027962677604768914}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_stem": {"acc,none": 0.2562638756739613, "acc_stderr,none": 0.0077727063934584254, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04072314811876837}, "mmlu_astronomy": {"alias": " - 
astronomy", "acc,none": 0.2565789473684211, "acc_stderr,none": 0.0355418036802569}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653697}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617747}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28936170212765955, "acc_stderr,none": 0.02964400657700962}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131184}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.022182037202948365}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2709677419354839, "acc_stderr,none": 0.02528441611490016}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.26108374384236455, "acc_stderr,none": 0.030903796952114485}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2814814814814815, "acc_stderr,none": 0.02742001935094527}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.0347918557259966}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.19444444444444445, "acc_stderr,none": 0.026991454502036723}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04287858751340455}, "mmlu_pro": {"exact_match,custom-extract": 0.0993184840425532, "exact_match_stderr,custom-extract": 0.002720067080923944, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.09205020920502092, "exact_match_stderr,custom-extract": 0.010804055220350202}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11153358681875793, "exact_match_stderr,custom-extract": 0.011213991771867694}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.05742049469964664, "exact_match_stderr,custom-extract": 0.006917690995369925}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.13170731707317074, "exact_match_stderr,custom-extract": 0.016721543700347667}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.14099526066350712, "exact_match_stderr,custom-extract": 0.011986330546463385}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09081527347781218, "exact_match_stderr,custom-extract": 0.009235657832562175}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12591687041564792, "exact_match_stderr,custom-extract": 0.011606661034408541}, "mmlu_pro_history": {"alias": " - history", 
"exact_match,custom-extract": 0.12335958005249344, "exact_match_stderr,custom-extract": 0.016869623436798514}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09900090826521345, "exact_match_stderr,custom-extract": 0.009005035380672356}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07623982235381199, "exact_match_stderr,custom-extract": 0.0072227681079820505}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11471861471861472, "exact_match_stderr,custom-extract": 0.010489547712821732}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.09218436873747494, "exact_match_stderr,custom-extract": 0.012963217262821293}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08314087759815242, "exact_match_stderr,custom-extract": 0.007663395880276764}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.11654135338345864, "exact_match_stderr,custom-extract": 0.011365903926393264}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.236, "acc_stderr,none": 0.019008699622084724, "acc_norm,none": 0.36, "acc_norm_stderr,none": 0.02148775108972052}, "piqa": {"alias": "piqa", "acc,none": 0.6686615886833515, "acc_stderr,none": 0.010982077458957348, "acc_norm,none": 0.6789989118607181, "acc_norm_stderr,none": 0.010892641574707904}, "race": {"alias": "race", "acc,none": 0.3311004784688995, "acc_stderr,none": 0.014564986871061022}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.417093142272262, "acc_stderr,none": 0.011157450926787525}, "winogrande": {"alias": "winogrande", "acc,none": 0.595895816890292, "acc_stderr,none": 0.013791610664670844}} {"created_at": "2025-04-20T23:09:59.345410", "global_step": 170000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2909556313993174, "acc_stderr,none": 0.013273077865907588, "acc_norm,none": 0.3267918088737201, "acc_norm_stderr,none": 0.013706665975587333}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6195286195286195, "acc_stderr,none": 0.009962305992058574, "acc_norm,none": 0.5921717171717171, "acc_norm_stderr,none": 0.010083950240041205}, "boolq": {"alias": "boolq", "acc,none": 0.6382262996941896, "acc_stderr,none": 0.008404238796949261}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20147420147420148, "acc_stderr,none": 0.011483500195202903}, "copa": {"alias": "copa", "acc,none": 0.69, "acc_stderr,none": 0.04648231987117316}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38637721569408484, "acc_stderr,none": 0.004859236191579797, "acc_norm,none": 0.5018920533758215, "acc_norm_stderr,none": 0.004989745685820425}, "mmlu": {"acc,none": 0.25566158666856575, "acc_stderr,none": 0.0036781913107501447, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24697130712008503, "acc_stderr,none": 0.006283520557454611, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3492063492063492, "acc_stderr,none": 0.042639068927951315}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.28484848484848485, "acc_stderr,none": 0.035243908445117836}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.03096451792692339}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.21518987341772153, "acc_stderr,none": 0.02675082699467617}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 
0.19834710743801653, "acc_stderr,none": 0.036401182719909456}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.32407407407407407, "acc_stderr,none": 0.04524596007030048}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615769}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.0218552552634218}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2435754189944134, "acc_stderr,none": 0.014355911964767864}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.24758842443729903, "acc_stderr,none": 0.024513879973621967}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.023788583551658533}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24902216427640156, "acc_stderr,none": 0.01104489226404077}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.25146198830409355, "acc_stderr,none": 0.033275044238468436}, "mmlu_other": {"acc,none": 0.24975860959124557, "acc_stderr,none": 0.0077604325354478434, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.29056603773584905, "acc_stderr,none": 0.02794321998933714}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.0332055644308557}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653695}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.17937219730941703, "acc_stderr,none": 0.025749819569192794}, "mmlu_management": {"alias": " - management", "acc,none": 0.2621359223300971, "acc_stderr,none": 0.04354631077260595}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.23931623931623933, "acc_stderr,none": 0.027951826808924333}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24010217113665389, "acc_stderr,none": 0.015274685213734191}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2875816993464052, "acc_stderr,none": 0.02591780611714716}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2907801418439716, "acc_stderr,none": 0.027090664368353178}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.25735294117647056, "acc_stderr,none": 0.02655651947004153}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2469879518072289, "acc_stderr,none": 0.03357351982064536}, "mmlu_social_sciences": {"acc,none": 0.2694182645433864, "acc_stderr,none": 0.007995596520839643, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.20175438596491227, "acc_stderr,none": 0.03775205013583637}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.26262626262626265, "acc_stderr,none": 0.03135305009533085}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.32642487046632124, "acc_stderr,none": 0.033840286211432945}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2743589743589744, "acc_stderr,none": 0.02262276576749322}, 
"mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31932773109243695, "acc_stderr,none": 0.030283995525884396}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.28807339449541286, "acc_stderr,none": 0.01941644589263602}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.037683359597287434}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.26633986928104575, "acc_stderr,none": 0.01788318813466719}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.04122066502878285}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.23673469387755103, "acc_stderr,none": 0.027212835884073142}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22885572139303484, "acc_stderr,none": 0.02970528405677243}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_stem": {"acc,none": 0.26102124960355216, "acc_stderr,none": 0.00781389546733295, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.17777777777777778, "acc_stderr,none": 0.03302789859901719}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.03583496176361063}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.03476590104304134}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.04533838195929774}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.22127659574468084, "acc_stderr,none": 0.02713634960242405}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.02193587808118476}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2838709677419355, "acc_stderr,none": 0.02564938106302926}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.270935960591133, "acc_stderr,none": 0.031270907132977}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.026593939101844065}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2582781456953642, "acc_stderr,none": 0.035737053147634576}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3425925925925926, "acc_stderr,none": 0.03236585252602156}, 
"mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.22321428571428573, "acc_stderr,none": 0.03952301967702511}, "mmlu_pro": {"exact_match,custom-extract": 0.09242021276595745, "exact_match_stderr,custom-extract": 0.0026279879328295875, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.10739191073919108, "exact_match_stderr,custom-extract": 0.011570701311353664}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11660329531051965, "exact_match_stderr,custom-extract": 0.01143326292260529}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.04328621908127209, "exact_match_stderr,custom-extract": 0.006051102910695479}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.09024390243902439, "exact_match_stderr,custom-extract": 0.014168039768581506}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.11966824644549763, "exact_match_stderr,custom-extract": 0.011178894558789445}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07120743034055728, "exact_match_stderr,custom-extract": 0.008265789561265736}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1295843520782396, "exact_match_stderr,custom-extract": 0.011749749223850718}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10236220472440945, "exact_match_stderr,custom-extract": 0.015549935163883115}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08628519527702089, "exact_match_stderr,custom-extract": 0.008465977919833967}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.04885270170244264, "exact_match_stderr,custom-extract": 0.005866800202950919}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.13203463203463203, "exact_match_stderr,custom-extract": 0.011142798517705915}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11623246492985972, "exact_match_stderr,custom-extract": 0.014362104240159225}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08237105465742879, "exact_match_stderr,custom-extract": 0.007631036296293874}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.11779448621553884, "exact_match_stderr,custom-extract": 0.011418740524805817}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.228, "acc_stderr,none": 0.018781306529363197, "acc_norm,none": 0.354, "acc_norm_stderr,none": 0.021407582047916447}, "piqa": {"alias": "piqa", "acc,none": 0.6588683351468988, "acc_stderr,none": 0.0110612894439627, "acc_norm,none": 0.6621327529923831, "acc_norm_stderr,none": 0.011035474307853845}, "race": {"alias": "race", "acc,none": 0.3349282296650718, "acc_stderr,none": 0.014606961503556259}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4053224155578301, "acc_stderr,none": 0.011109383877623348}, "winogrande": {"alias": "winogrande", "acc,none": 0.585635359116022, "acc_stderr,none": 0.01384484623226856}} {"created_at": "2025-04-21T00:52:39.606254", "global_step": 172000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.30802047781569963, "acc_stderr,none": 0.01349142951729204, "acc_norm,none": 0.3464163822525597, "acc_norm_stderr,none": 0.013905011180063246}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6481481481481481, "acc_stderr,none": 0.009799078929868706, "acc_norm,none": 
0.6317340067340067, "acc_norm_stderr,none": 0.009897286209010892}, "boolq": {"alias": "boolq", "acc,none": 0.6571865443425077, "acc_stderr,none": 0.008301676410578645}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.18755118755118755, "acc_stderr,none": 0.011175783964114746}, "copa": {"alias": "copa", "acc,none": 0.68, "acc_stderr,none": 0.046882617226215034}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3913563035251942, "acc_stderr,none": 0.004870563921220621, "acc_norm,none": 0.507468631746664, "acc_norm_stderr,none": 0.004989224715784541}, "mmlu": {"acc,none": 0.25103261643640506, "acc_stderr,none": 0.003657123740110867, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24654622741764082, "acc_stderr,none": 0.006279443093521198, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.038932596106046755}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2, "acc_stderr,none": 0.031234752377721175}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591362}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.27848101265822783, "acc_stderr,none": 0.02917868230484256}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04065578140908705}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04330043749650742}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.032910995786157686}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2254335260115607, "acc_stderr,none": 0.022497230190967547}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2435754189944134, "acc_stderr,none": 0.014355911964767864}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19935691318327975, "acc_stderr,none": 0.022691033780549656}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2006172839506173, "acc_stderr,none": 0.02228231394977489}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2633637548891786, "acc_stderr,none": 0.01124950640360531}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30994152046783624, "acc_stderr,none": 0.035469769593931624}, "mmlu_other": {"acc,none": 0.25523012552301255, "acc_stderr,none": 0.007819468410561178, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2188679245283019, "acc_stderr,none": 0.025447863825108608}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0326926380614177}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3094170403587444, "acc_stderr,none": 0.031024411740572206}, "mmlu_management": {"alias": " - management", "acc,none": 0.22330097087378642, "acc_stderr,none": 0.04123553189891431}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.31196581196581197, "acc_stderr,none": 0.030351527323344937}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, 
"acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24904214559386972, "acc_stderr,none": 0.015464676163395974}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.02428861946604611}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24468085106382978, "acc_stderr,none": 0.025645553622266736}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2536764705882353, "acc_stderr,none": 0.02643132987078954}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21686746987951808, "acc_stderr,none": 0.03208284450356365}, "mmlu_social_sciences": {"acc,none": 0.25901852453688656, "acc_stderr,none": 0.007903359604755987, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.23737373737373738, "acc_stderr,none": 0.030313710538198913}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.27461139896373055, "acc_stderr,none": 0.032210245080411544}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.24871794871794872, "acc_stderr,none": 0.0219169577092138}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3067226890756303, "acc_stderr,none": 0.029953823891887044}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25137614678899084, "acc_stderr,none": 0.01859920636028741}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2826797385620915, "acc_stderr,none": 0.018217269552053442}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.041220665028782855}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24081632653061225, "acc_stderr,none": 0.02737294220178817}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_stem": {"acc,none": 0.2457976530288614, "acc_stderr,none": 0.007655395187168924, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.14, "acc_stderr,none": 0.03487350880197772}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2236842105263158, "acc_stderr,none": 0.03391160934343604}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2986111111111111, "acc_stderr,none": 0.03827052357950756}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165065}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206824}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3137254901960784, "acc_stderr,none": 
0.04617034827006718}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.24680851063829787, "acc_stderr,none": 0.028185441301234095}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.19310344827586207, "acc_stderr,none": 0.03289445522127401}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.022418042891113935}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25483870967741934, "acc_stderr,none": 0.02479011845933221}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2019704433497537, "acc_stderr,none": 0.02824735012218027}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.026067159222275784}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.0347918557259966}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.22685185185185186, "acc_stderr,none": 0.028561650102422273}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.19642857142857142, "acc_stderr,none": 0.037709700493470194}, "mmlu_pro": {"exact_match,custom-extract": 0.09890292553191489, "exact_match_stderr,custom-extract": 0.002717261364125198, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.07391910739191074, "exact_match_stderr,custom-extract": 0.00977791612721811}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09252217997465145, "exact_match_stderr,custom-extract": 0.010322332141863079}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07420494699646643, "exact_match_stderr,custom-extract": 0.0077936797285690996}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.12439024390243902, "exact_match_stderr,custom-extract": 0.01631874671019561}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.12085308056872038, "exact_match_stderr,custom-extract": 0.011226536807498494}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09391124871001032, "exact_match_stderr,custom-extract": 0.009375760359013278}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12102689486552567, "exact_match_stderr,custom-extract": 0.011410842488489002}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11023622047244094, "exact_match_stderr,custom-extract": 0.01606599843477818}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09536784741144415, "exact_match_stderr,custom-extract": 0.008856062181125272}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07549962990377498, "exact_match_stderr,custom-extract": 0.007190499688409162}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12121212121212122, "exact_match_stderr,custom-extract": 0.01074271871039956}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12424849699398798, "exact_match_stderr,custom-extract": 
0.014781596611020435}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.0869899923017706, "exact_match_stderr,custom-extract": 0.00782231082493139}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12656641604010024, "exact_match_stderr,custom-extract": 0.011777280638403536}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.24, "acc_stderr,none": 0.019118866653759756, "acc_norm,none": 0.342, "acc_norm_stderr,none": 0.021236147199899257}, "piqa": {"alias": "piqa", "acc,none": 0.6556039173014145, "acc_stderr,none": 0.011086521237125623, "acc_norm,none": 0.6621327529923831, "acc_norm_stderr,none": 0.011035474307853843}, "race": {"alias": "race", "acc,none": 0.3473684210526316, "acc_stderr,none": 0.014735977850381396}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.417093142272262, "acc_stderr,none": 0.011157450926787526}, "winogrande": {"alias": "winogrande", "acc,none": 0.5808997632202052, "acc_stderr,none": 0.013867325192210114}} {"created_at": "2025-04-21T02:58:12.083050", "global_step": 174000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.2986348122866894, "acc_stderr,none": 0.013374078615068747, "acc_norm,none": 0.3515358361774744, "acc_norm_stderr,none": 0.013952413699600938}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6182659932659933, "acc_stderr,none": 0.009968648851839667, "acc_norm,none": 0.5938552188552189, "acc_norm_stderr,none": 0.010077409815364053}, "boolq": {"alias": "boolq", "acc,none": 0.6461773700305811, "acc_stderr,none": 0.008362983020904468}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20065520065520065, "acc_stderr,none": 0.01146601146601154}, "copa": {"alias": "copa", "acc,none": 0.68, "acc_stderr,none": 0.046882617226215034}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.39364668392750446, "acc_stderr,none": 0.004875595792850676, "acc_norm,none": 0.5052778331009758, "acc_norm_stderr,none": 0.004989503417767285}, "mmlu": {"acc,none": 0.2341546788206808, "acc_stderr,none": 0.0035678347521296574, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2501594048884166, "acc_stderr,none": 0.006311524912660646, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.03809523809523812}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2, "acc_stderr,none": 0.031234752377721175}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.031321798030832904}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.02904133351059804}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.032910995786157686}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2581005586592179, "acc_stderr,none": 0.014635185616527826}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.18971061093247588, "acc_stderr,none": 0.02226819625878322}, "mmlu_prehistory": {"alias": 
" - prehistory", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.022779719088733393}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26140808344198174, "acc_stderr,none": 0.011222528169771312}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30409356725146197, "acc_stderr,none": 0.03528211258245232}, "mmlu_other": {"acc,none": 0.23978113936272932, "acc_stderr,none": 0.00764647869035004, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.20754716981132076, "acc_stderr,none": 0.02495991802891127}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.21965317919075145, "acc_stderr,none": 0.031568093627031744}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036625}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3094170403587444, "acc_stderr,none": 0.03102441174057219}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.040580420156460344}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3034188034188034, "acc_stderr,none": 0.030118210106942645}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23371647509578544, "acc_stderr,none": 0.015133383278988827}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.0239291555173513}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.23404255319148937, "acc_stderr,none": 0.025257861359432414}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.18382352941176472, "acc_stderr,none": 0.023529242185193106}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.22294442638934026, "acc_stderr,none": 0.007495341161971284, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.04185774424022057}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.18686868686868688, "acc_stderr,none": 0.027772533334218977}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.18134715025906736, "acc_stderr,none": 0.027807032360686088}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2153846153846154, "acc_stderr,none": 0.020843034557462874}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1908256880733945, "acc_stderr,none": 0.016847676400091105}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.26717557251908397, "acc_stderr,none": 0.038808483010823944}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2565359477124183, "acc_stderr,none": 0.017667841612378995}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.041723430387053825}, "mmlu_security_studies": {"alias": " - 
security_studies", "acc,none": 0.19591836734693877, "acc_stderr,none": 0.025409301953225678}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.03014777593540922}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_stem": {"acc,none": 0.21566761814145258, "acc_stderr,none": 0.007313106356405481, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384739}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.1925925925925926, "acc_stderr,none": 0.03406542058502653}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.18421052631578946, "acc_stderr,none": 0.0315469804508223}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.028504856470514192}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.021411684393694196}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.18387096774193548, "acc_stderr,none": 0.022037217340267846}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1625615763546798, "acc_stderr,none": 0.025960300064605576}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.02488211685765511}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.19205298013245034, "acc_stderr,none": 0.032162984205936156}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.14814814814814814, "acc_stderr,none": 0.024227629273728363}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25892857142857145, "acc_stderr,none": 0.04157751539865629}, "mmlu_pro": {"exact_match,custom-extract": 0.1085438829787234, "exact_match_stderr,custom-extract": 0.0028284249066464822, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.16178521617852162, "exact_match_stderr,custom-extract": 0.013762285522387244}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10139416983523447, "exact_match_stderr,custom-extract": 0.010752959229023348}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07773851590106007, "exact_match_stderr,custom-extract": 0.007961847521542125}, 
"mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.11463414634146342, "exact_match_stderr,custom-extract": 0.015752762697429715}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.15165876777251186, "exact_match_stderr,custom-extract": 0.012353933579536035}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09907120743034056, "exact_match_stderr,custom-extract": 0.009602432935115188}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1136919315403423, "exact_match_stderr,custom-extract": 0.011105705318793027}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.13910761154855644, "exact_match_stderr,custom-extract": 0.017752441192974863}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09264305177111716, "exact_match_stderr,custom-extract": 0.0087417658258632}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.09030347890451518, "exact_match_stderr,custom-extract": 0.007800700851811406}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11796536796536797, "exact_match_stderr,custom-extract": 0.010617425726799199}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1282565130260521, "exact_match_stderr,custom-extract": 0.01498371136300156}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08160123171670516, "exact_match_stderr,custom-extract": 0.007598478818397742}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12781954887218044, "exact_match_stderr,custom-extract": 0.01182694708230729}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.23, "acc_stderr,none": 0.018839050391123144, "acc_norm,none": 0.362, "acc_norm_stderr,none": 0.021513662527582404}, "piqa": {"alias": "piqa", "acc,none": 0.6702937976060935, "acc_stderr,none": 0.010968357083095152, "acc_norm,none": 0.6833514689880305, "acc_norm_stderr,none": 0.010853160531978483}, "race": {"alias": "race", "acc,none": 0.3435406698564593, "acc_stderr,none": 0.014697475413671399}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4053224155578301, "acc_stderr,none": 0.011109383877623348}, "winogrande": {"alias": "winogrande", "acc,none": 0.5990528808208366, "acc_stderr,none": 0.013773974554948026}} {"created_at": "2025-04-21T04:46:41.445718", "global_step": 176000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.29948805460750855, "acc_stderr,none": 0.013385021637313565, "acc_norm,none": 0.3361774744027304, "acc_norm_stderr,none": 0.013804855026205761}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6174242424242424, "acc_stderr,none": 0.009972837790531477, "acc_norm,none": 0.6014309764309764, "acc_norm_stderr,none": 0.010046455400477933}, "boolq": {"alias": "boolq", "acc,none": 0.5587155963302752, "acc_stderr,none": 0.008684548127832634}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19082719082719082, "acc_stderr,none": 0.01125021581097904}, "copa": {"alias": "copa", "acc,none": 0.66, "acc_stderr,none": 0.04760952285695237}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3891655048795061, "acc_stderr,none": 0.004865645485910429, "acc_norm,none": 0.5013941445927106, "acc_norm_stderr,none": 0.004989762014739187}, "mmlu": {"acc,none": 0.22981056829511465, "acc_stderr,none": 0.0035447125229028744, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24250797024442083, "acc_stderr,none": 0.006245137435523674, 
"alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04006168083848878}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.031922715695482995}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.03058759135160425}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.27848101265822783, "acc_stderr,none": 0.029178682304842555}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24566473988439305, "acc_stderr,none": 0.023176298203992012}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480764}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.20679012345679013, "acc_stderr,none": 0.02253500670594282}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24902216427640156, "acc_stderr,none": 0.01104489226404077}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.23945928548439008, "acc_stderr,none": 0.0076408392658152986, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891366}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.03095289021774988}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.029872577708891165}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2388250319284802, "acc_stderr,none": 0.015246803197398687}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.0239291555173513}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.23049645390070922, "acc_stderr,none": 0.025123739226872405}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19117647058823528, "acc_stderr,none": 0.023886881922440338}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.21741956451088723, 
"acc_stderr,none": 0.007432526976248773, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281335}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.1717171717171717, "acc_stderr,none": 0.026869716187429917}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19170984455958548, "acc_stderr,none": 0.028408953626245282}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20256410256410257, "acc_stderr,none": 0.020377660970371386}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1926605504587156, "acc_stderr,none": 0.016909276884936094}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25, "acc_stderr,none": 0.01751781884501444}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072775}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19591836734693877, "acc_stderr,none": 0.025409301953225678}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.2134475103076435, "acc_stderr,none": 0.007283840995270959, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.0335567721631314}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.17763157894736842, "acc_stderr,none": 0.031103182383123398}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342347}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.02084229093011467}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.17419354838709677, "acc_stderr,none": 0.02157624818451457}, "mmlu_high_school_chemistry": {"alias": " - 
high_school_chemistry", "acc,none": 0.15270935960591134, "acc_stderr,none": 0.025308904539380627}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.02534809746809787}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16203703703703703, "acc_stderr,none": 0.025130453652268455}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.09383311170212766, "exact_match_stderr,custom-extract": 0.002650268057901815, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.11157601115760112, "exact_match_stderr,custom-extract": 0.011766276311081417}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11660329531051965, "exact_match_stderr,custom-extract": 0.011433262922605296}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.053886925795053005, "exact_match_stderr,custom-extract": 0.006714009070040241}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.08536585365853659, "exact_match_stderr,custom-extract": 0.013816694190586966}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.12796208530805686, "exact_match_stderr,custom-extract": 0.011505210023672559}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10835913312693499, "exact_match_stderr,custom-extract": 0.00999056535282795}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.10268948655256724, "exact_match_stderr,custom-extract": 0.010619971250260032}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.09973753280839895, "exact_match_stderr,custom-extract": 0.015371706524248085}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.0653950953678474, "exact_match_stderr,custom-extract": 0.007454015200467411}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06735751295336788, "exact_match_stderr,custom-extract": 0.0068215605036723295}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11580086580086581, "exact_match_stderr,custom-extract": 0.010532466716077542}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.10420841683366733, "exact_match_stderr,custom-extract": 0.013691159072055334}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07775211701308699, "exact_match_stderr,custom-extract": 0.007432631448995884}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12907268170426064, "exact_match_stderr,custom-extract": 0.011876239922954068}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.212, "acc_stderr,none": 0.018297037004013885, "acc_norm,none": 0.334, "acc_norm_stderr,none": 0.02111349234774372}, "piqa": {"alias": "piqa", "acc,none": 0.6692056583242655, "acc_stderr,none": 0.01097752058471444, "acc_norm,none": 0.6702937976060935, "acc_norm_stderr,none": 0.010968357083095152}, "race": {"alias": "race", "acc,none": 0.3311004784688995, 
"acc_stderr,none": 0.014564986871061022}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.41760491299897645, "acc_stderr,none": 0.011159391894922484}, "winogrande": {"alias": "winogrande", "acc,none": 0.5824782951854776, "acc_stderr,none": 0.013859978264440251}} {"created_at": "2025-04-21T06:37:55.759478", "global_step": 178000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.31143344709897613, "acc_stderr,none": 0.013532472099850945, "acc_norm,none": 0.3506825938566553, "acc_norm_stderr,none": 0.013944635930726089}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.632996632996633, "acc_stderr,none": 0.009890173658452118, "acc_norm,none": 0.6052188552188552, "acc_norm_stderr,none": 0.01003003893588359}, "boolq": {"alias": "boolq", "acc,none": 0.5896024464831804, "acc_stderr,none": 0.008603488048617521}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21048321048321048, "acc_stderr,none": 0.011671038436522911}, "copa": {"alias": "copa", "acc,none": 0.67, "acc_stderr,none": 0.047258156262526066}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3884684325831508, "acc_stderr,none": 0.004864058877626279, "acc_norm,none": 0.5002987452698665, "acc_norm_stderr,none": 0.004989780520782243}, "mmlu": {"acc,none": 0.2555191568152685, "acc_stderr,none": 0.00367979682577026, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.26482465462274174, "acc_stderr,none": 0.006436132881864303, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.03619604524124249}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.03346409881055953}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.030964517926923413}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.26582278481012656, "acc_stderr,none": 0.02875679962965834}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2644628099173554, "acc_stderr,none": 0.040261875275912046}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.043300437496507437}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3128834355828221, "acc_stderr,none": 0.036429145782924055}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2774566473988439, "acc_stderr,none": 0.024105712607754307}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.26366559485530544, "acc_stderr,none": 0.02502553850053234}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2839506172839506, "acc_stderr,none": 0.02508947852376513}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26597131681877445, "acc_stderr,none": 0.011285033165551283}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.03508771929824563}, "mmlu_other": {"acc,none": 0.24943675571290633, "acc_stderr,none": 0.007758008335959121, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036844}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.27169811320754716, "acc_stderr,none": 0.027377706624670713}, 
"mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23121387283236994, "acc_stderr,none": 0.0321473730202947}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.22869955156950672, "acc_stderr,none": 0.028188240046929193}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.03916667762822584}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2606837606837607, "acc_stderr,none": 0.028760348956523414}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.27330779054916987, "acc_stderr,none": 0.015936681062628553}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2679738562091503, "acc_stderr,none": 0.025360603796242557}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25886524822695034, "acc_stderr,none": 0.026129572527180848}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.17647058823529413, "acc_stderr,none": 0.023157468308559366}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.23493975903614459, "acc_stderr,none": 0.03300533186128922}, "mmlu_social_sciences": {"acc,none": 0.2479688007799805, "acc_stderr,none": 0.007780320854271593, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518753}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2828282828282828, "acc_stderr,none": 0.03208779558786752}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20725388601036268, "acc_stderr,none": 0.029252823291803638}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.21794871794871795, "acc_stderr,none": 0.020932445774463192}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24789915966386555, "acc_stderr,none": 0.028047967224176892}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23486238532110093, "acc_stderr,none": 0.018175110510343585}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.1984732824427481, "acc_stderr,none": 0.034981493854624734}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.01843342764940191}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.040139645540727714}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24081632653061225, "acc_stderr,none": 0.027372942201788167}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.25870646766169153, "acc_stderr,none": 0.030965903123573037}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_stem": {"acc,none": 0.25499524262607043, "acc_stderr,none": 0.007760990143027173, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206824}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2814814814814815, "acc_stderr,none": 0.03885004245800254}, "mmlu_astronomy": {"alias": " - astronomy", 
"acc,none": 0.3157894736842105, "acc_stderr,none": 0.0378272898086547}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566017}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.15, "acc_stderr,none": 0.0358870281282637}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.22127659574468084, "acc_stderr,none": 0.02713634960242406}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.30344827586206896, "acc_stderr,none": 0.038312260488503336}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.02286083830923207}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.23548387096774193, "acc_stderr,none": 0.024137632429337707}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2315270935960591, "acc_stderr,none": 0.029678333141444455}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2740740740740741, "acc_stderr,none": 0.027195934804085626}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.03603038545360384}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.027467401804057993}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2767857142857143, "acc_stderr,none": 0.04246624336697624}, "mmlu_pro": {"exact_match,custom-extract": 0.09142287234042554, "exact_match_stderr,custom-extract": 0.002616816663912114, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.10320781032078104, "exact_match_stderr,custom-extract": 0.011369612924724335}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11787072243346007, "exact_match_stderr,custom-extract": 0.011486983100199012}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.03975265017667844, "exact_match_stderr,custom-extract": 0.005809560779109003}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.09268292682926829, "exact_match_stderr,custom-extract": 0.014338963443185472}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.12085308056872038, "exact_match_stderr,custom-extract": 0.011226536807498492}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07327141382868937, "exact_match_stderr,custom-extract": 0.008375406351572436}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12102689486552567, "exact_match_stderr,custom-extract": 0.011410842488489005}, "mmlu_pro_history": {"alias": " - history", 
"exact_match,custom-extract": 0.10236220472440945, "exact_match_stderr,custom-extract": 0.015549935163883123}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07175295186194369, "exact_match_stderr,custom-extract": 0.007781356843650032}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06069578090303479, "exact_match_stderr,custom-extract": 0.006498535623285848}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11363636363636363, "exact_match_stderr,custom-extract": 0.010446330904021006}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12024048096192384, "exact_match_stderr,custom-extract": 0.014574466566661969}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08468052347959969, "exact_match_stderr,custom-extract": 0.007727531295642343}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12907268170426064, "exact_match_stderr,custom-extract": 0.01187623992295407}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.224, "acc_stderr,none": 0.01866399446471079, "acc_norm,none": 0.338, "acc_norm_stderr,none": 0.02117566569520941}, "piqa": {"alias": "piqa", "acc,none": 0.6675734494015234, "acc_stderr,none": 0.010991141557445594, "acc_norm,none": 0.6746463547334058, "acc_norm_stderr,none": 0.010931036623525193}, "race": {"alias": "race", "acc,none": 0.3473684210526316, "acc_stderr,none": 0.014735977850381395}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.41146366427840325, "acc_stderr,none": 0.011135283116540199}, "winogrande": {"alias": "winogrande", "acc,none": 0.574585635359116, "acc_stderr,none": 0.013895257666646383}} {"created_at": "2025-04-21T08:30:38.170356", "global_step": 180000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3003412969283277, "acc_stderr,none": 0.01339590930995699, "acc_norm,none": 0.3319112627986348, "acc_norm_stderr,none": 0.013760988200880536}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6254208754208754, "acc_stderr,none": 0.009931758820410619, "acc_norm,none": 0.6005892255892256, "acc_norm_stderr,none": 0.010050018228742118}, "boolq": {"alias": "boolq", "acc,none": 0.6685015290519878, "acc_stderr,none": 0.00823350032457152}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.18673218673218672, "acc_stderr,none": 0.011156975219176355}, "copa": {"alias": "copa", "acc,none": 0.71, "acc_stderr,none": 0.04560480215720684}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3915554670384386, "acc_stderr,none": 0.004871005939407468, "acc_norm,none": 0.5103565026887075, "acc_norm_stderr,none": 0.004988710917169333}, "mmlu": {"acc,none": 0.25402364335564737, "acc_stderr,none": 0.003666959296353058, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25802337938363445, "acc_stderr,none": 0.006371653454261479, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.18253968253968253, "acc_stderr,none": 0.03455071019102146}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.03192271569548299}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.22058823529411764, "acc_stderr,none": 0.029102254389674082}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2489451476793249, "acc_stderr,none": 0.028146970599422644}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 
0.3305785123966942, "acc_stderr,none": 0.04294340845212095}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.041331194402438376}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26993865030674846, "acc_stderr,none": 0.034878251684978906}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24566473988439305, "acc_stderr,none": 0.02317629820399201}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.22186495176848875, "acc_stderr,none": 0.023598858292863047}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.02465968518596729}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2737940026075619, "acc_stderr,none": 0.011388612167979388}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3508771929824561, "acc_stderr,none": 0.03660298834049162}, "mmlu_other": {"acc,none": 0.2603797875764403, "acc_stderr,none": 0.00785768687101813, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891363}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23121387283236994, "acc_stderr,none": 0.032147373020294696}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.20388349514563106, "acc_stderr,none": 0.0398913985953177}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3076923076923077, "acc_stderr,none": 0.030236389942173092}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.015671006009339586}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.02355083135199509}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25177304964539005, "acc_stderr,none": 0.0258921511567094}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.22426470588235295, "acc_stderr,none": 0.02533684856333236}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.30120481927710846, "acc_stderr,none": 0.03571609230053481}, "mmlu_social_sciences": {"acc,none": 0.24601884952876177, "acc_stderr,none": 0.007741765189157665, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.04142439719489362}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.15151515151515152, "acc_stderr,none": 0.025545650426603606}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.24352331606217617, "acc_stderr,none": 0.03097543638684542}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20512820512820512, "acc_stderr,none": 0.020473233173551965}, 
"mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.25210084033613445, "acc_stderr,none": 0.028205545033277726}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22385321100917432, "acc_stderr,none": 0.017871217767790215}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.21374045801526717, "acc_stderr,none": 0.0359546161177469}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.018635594034423983}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3, "acc_stderr,none": 0.04389311454644286}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2653061224489796, "acc_stderr,none": 0.028263889943784606}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.03014777593540922}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_stem": {"acc,none": 0.2496035521725341, "acc_stderr,none": 0.0076980860773853636, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.037857144650666544}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2236842105263158, "acc_stderr,none": 0.033911609343436025}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.22916666666666666, "acc_stderr,none": 0.03514697467862388}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.038612291966536955}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.029241883869628848}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.03664666337225256}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.022418042891113946}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24193548387096775, "acc_stderr,none": 0.024362599693031086}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.22660098522167488, "acc_stderr,none": 0.02945486383529296}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.22962962962962963, "acc_stderr,none": 0.025644108639267638}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2980132450331126, "acc_stderr,none": 0.037345356767871984}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.17592592592592593, "acc_stderr,none": 0.025967420958258533}, 
"mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.36607142857142855, "acc_stderr,none": 0.0457237235873743}, "mmlu_pro": {"exact_match,custom-extract": 0.09084109042553191, "exact_match_stderr,custom-extract": 0.002606931645309282, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.11436541143654114, "exact_match_stderr,custom-extract": 0.011893731278281038}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10012674271229405, "exact_match_stderr,custom-extract": 0.010693074879962135}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.03356890459363958, "exact_match_stderr,custom-extract": 0.0053557800104931256}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.11219512195121951, "exact_match_stderr,custom-extract": 0.015605730293675795}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.11255924170616113, "exact_match_stderr,custom-extract": 0.010885452262179492}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07739938080495357, "exact_match_stderr,custom-extract": 0.008588907694718309}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12469437652811736, "exact_match_stderr,custom-extract": 0.011558254824072236}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12860892388451445, "exact_match_stderr,custom-extract": 0.017173163625244684}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.061762034514078114, "exact_match_stderr,custom-extract": 0.007258066710204822}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06291635825314582, "exact_match_stderr,custom-extract": 0.006608518078813442}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1038961038961039, "exact_match_stderr,custom-extract": 0.010043335327351773}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12424849699398798, "exact_match_stderr,custom-extract": 0.014781596611020452}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08160123171670516, "exact_match_stderr,custom-extract": 0.007598478818397698}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.13784461152882205, "exact_match_stderr,custom-extract": 0.012211204647685924}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.242, "acc_stderr,none": 0.01917308567833717, "acc_norm,none": 0.358, "acc_norm_stderr,none": 0.02146143486285912}, "piqa": {"alias": "piqa", "acc,none": 0.6556039173014145, "acc_stderr,none": 0.011086521237125625, "acc_norm,none": 0.6664853101196954, "acc_norm_stderr,none": 0.011000139592184578}, "race": {"alias": "race", "acc,none": 0.3416267942583732, "acc_stderr,none": 0.01467782777076108}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.41453428863868985, "acc_stderr,none": 0.01114756056703673}, "winogrande": {"alias": "winogrande", "acc,none": 0.6037884767166535, "acc_stderr,none": 0.013746404157154954}} {"created_at": "2025-04-21T10:10:38.854934", "global_step": 182000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3267918088737201, "acc_stderr,none": 0.013706665975587336, "acc_norm,none": 0.3720136518771331, "acc_norm_stderr,none": 0.014124597881844461}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6334175084175084, "acc_stderr,none": 0.009887786585323962, "acc_norm,none": 
0.6212121212121212, "acc_norm_stderr,none": 0.009953737656542038}, "boolq": {"alias": "boolq", "acc,none": 0.6155963302752293, "acc_stderr,none": 0.008508133844703914}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.1981981981981982, "acc_stderr,none": 0.011413095456219316}, "copa": {"alias": "copa", "acc,none": 0.75, "acc_stderr,none": 0.04351941398892446}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.38936466839275047, "acc_stderr,none": 0.00486609688094144, "acc_norm,none": 0.513343955387373, "acc_norm_stderr,none": 0.004988004122536528}, "mmlu": {"acc,none": 0.24953710297678394, "acc_stderr,none": 0.0036502520677002075, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2675876726886291, "acc_stderr,none": 0.006459905109553025, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.31746031746031744, "acc_stderr,none": 0.04163453031302859}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.03374402644139404}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.030964517926923403}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2911392405063291, "acc_stderr,none": 0.029571601065753374}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04065578140908705}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.03351953879521269}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.28034682080924855, "acc_stderr,none": 0.02418242749657761}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2670391061452514, "acc_stderr,none": 0.014796502622562551}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.24758842443729903, "acc_stderr,none": 0.024513879973621967}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25617283950617287, "acc_stderr,none": 0.0242885336377261}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26792698826597133, "acc_stderr,none": 0.011311347690633874}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.27485380116959063, "acc_stderr,none": 0.03424042924691584}, "mmlu_other": {"acc,none": 0.251689732861281, "acc_stderr,none": 0.007785398757433294, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2679245283018868, "acc_stderr,none": 0.02725726032249485}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.30057803468208094, "acc_stderr,none": 0.03496101481191181}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3004484304932735, "acc_stderr,none": 0.030769352008229132}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.04058042015646035}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.26495726495726496, "acc_stderr,none": 0.02891120880274948}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, 
"acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24265644955300128, "acc_stderr,none": 0.01532988894089987}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023805186524888146}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25177304964539005, "acc_stderr,none": 0.025892151156709405}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20220588235294118, "acc_stderr,none": 0.024398192986654924}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.27710843373493976, "acc_stderr,none": 0.034843315926805875}, "mmlu_social_sciences": {"acc,none": 0.23691907702307444, "acc_stderr,none": 0.0076596140214593, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.038351539543994194}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.21717171717171718, "acc_stderr,none": 0.029376616484945633}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21761658031088082, "acc_stderr,none": 0.02977866303775296}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20256410256410257, "acc_stderr,none": 0.020377660970371386}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.0275536144678638}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23669724770642203, "acc_stderr,none": 0.01822407811729908}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2908496732026144, "acc_stderr,none": 0.018373116915903966}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072775}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20408163265306123, "acc_stderr,none": 0.02580128347509051}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22388059701492538, "acc_stderr,none": 0.029475250236017183}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_stem": {"acc,none": 0.2327941642879797, "acc_stderr,none": 0.0075238823750834, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909281}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.038201699145179055}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.033176727875331574}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.037161774375660164}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.1568627450980392, "acc_stderr,none": 
0.03618664819936246}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.23829787234042554, "acc_stderr,none": 0.027851252973889767}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2827586206896552, "acc_stderr,none": 0.037528339580033376}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2566137566137566, "acc_stderr,none": 0.022494510767503154}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.23225806451612904, "acc_stderr,none": 0.02402225613030824}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.18719211822660098, "acc_stderr,none": 0.027444924966882618}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.22592592592592592, "acc_stderr,none": 0.02549753263960955}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2781456953642384, "acc_stderr,none": 0.03658603262763743}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18055555555555555, "acc_stderr,none": 0.02623287897149166}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.22321428571428573, "acc_stderr,none": 0.039523019677025116}, "mmlu_pro": {"exact_match,custom-extract": 0.09383311170212766, "exact_match_stderr,custom-extract": 0.002650604484023708, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.08647140864714087, "exact_match_stderr,custom-extract": 0.010503664174021156}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10899873257287707, "exact_match_stderr,custom-extract": 0.011101630697795403}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.061837455830388695, "exact_match_stderr,custom-extract": 0.007161987341250204}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.07317073170731707, "exact_match_stderr,custom-extract": 0.012876769299821081}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.12796208530805686, "exact_match_stderr,custom-extract": 0.011505210023672562}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.0846233230134159, "exact_match_stderr,custom-extract": 0.008945554797547963}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.11491442542787286, "exact_match_stderr,custom-extract": 0.011157550931380876}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12860892388451445, "exact_match_stderr,custom-extract": 0.01717316362524469}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07175295186194369, "exact_match_stderr,custom-extract": 0.0077813568436500445}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0614359733530718, "exact_match_stderr,custom-extract": 0.006535464195094106}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11796536796536797, "exact_match_stderr,custom-extract": 0.0106174257267992}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1402805611222445, "exact_match_stderr,custom-extract": 
0.015561893867712513}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.0939183987682833, "exact_match_stderr,custom-extract": 0.00809695357980784}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.10651629072681704, "exact_match_stderr,custom-extract": 0.01092753423409933}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.236, "acc_stderr,none": 0.019008699622084728, "acc_norm,none": 0.338, "acc_norm_stderr,none": 0.02117566569520941}, "piqa": {"alias": "piqa", "acc,none": 0.6632208922742111, "acc_stderr,none": 0.01102673892525118, "acc_norm,none": 0.6621327529923831, "acc_norm_stderr,none": 0.011035474307853845}, "race": {"alias": "race", "acc,none": 0.3521531100478469, "acc_stderr,none": 0.014782629897202259}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.41760491299897645, "acc_stderr,none": 0.011159391894922486}, "winogrande": {"alias": "winogrande", "acc,none": 0.6045777426992897, "acc_stderr,none": 0.01374167838754535}}
{"created_at": "2025-04-21T11:55:37.152939", "global_step": 184000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.33447098976109213, "acc_stderr,none": 0.013787460322441375, "acc_norm,none": 0.37457337883959047, "acc_norm_stderr,none": 0.014144193471893437}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6447811447811448, "acc_stderr,none": 0.009820245899287127, "acc_norm,none": 0.6224747474747475, "acc_norm_stderr,none": 0.009947227833469432}, "boolq": {"alias": "boolq", "acc,none": 0.6697247706422018, "acc_stderr,none": 0.00822581091427726}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20475020475020475, "acc_stderr,none": 0.011552714477876659}, "copa": {"alias": "copa", "acc,none": 0.68, "acc_stderr,none": 0.046882617226215034}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.391256721768572, "acc_stderr,none": 0.00487034259291505, "acc_norm,none": 0.5089623580959968, "acc_norm_stderr,none": 0.004988979750014427}, "mmlu": {"acc,none": 0.2649195271328871, "acc_stderr,none": 0.003718696913607899, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2548352816153029, "acc_stderr,none": 0.006355300049112161, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.039701582732351734}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.03317505930009179}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.26582278481012656, "acc_stderr,none": 0.02875679962965834}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.19008264462809918, "acc_stderr,none": 0.03581796951709282}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615768}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23410404624277456, "acc_stderr,none": 0.022797110278071128}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.27262569832402234, "acc_stderr,none": 0.014893391735249622}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.26688102893890675, "acc_stderr,none": 0.025122637608816643}, "mmlu_prehistory": {"alias":
" - prehistory", "acc,none": 0.22530864197530864, "acc_stderr,none": 0.02324620264781975}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26140808344198174, "acc_stderr,none": 0.011222528169771314}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03218093795602357}, "mmlu_other": {"acc,none": 0.2491149018345671, "acc_stderr,none": 0.007759233511863085, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2830188679245283, "acc_stderr,none": 0.027724236492700897}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.033450369167889904}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2556053811659193, "acc_stderr,none": 0.02927589100396993}, "mmlu_management": {"alias": " - management", "acc,none": 0.2912621359223301, "acc_stderr,none": 0.044986763205729224}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.24786324786324787, "acc_stderr,none": 0.028286324075564397}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.014866821664709607}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.02463004897982478}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25886524822695034, "acc_stderr,none": 0.026129572527180848}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2977941176470588, "acc_stderr,none": 0.027778298701545443}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2469879518072289, "acc_stderr,none": 0.03357351982064536}, "mmlu_social_sciences": {"acc,none": 0.287292817679558, "acc_stderr,none": 0.008148602291518349, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748142}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.29292929292929293, "acc_stderr,none": 0.032424979581788145}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.29015544041450775, "acc_stderr,none": 0.032752644677915166}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3076923076923077, "acc_stderr,none": 0.023400928918310488}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3067226890756303, "acc_stderr,none": 0.029953823891887044}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3174311926605505, "acc_stderr,none": 0.0199571521984605}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.03641297081313729}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.017282760695167425}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.041220665028782855}, "mmlu_security_studies": {"alias": " - 
security_studies", "acc,none": 0.33877551020408164, "acc_stderr,none": 0.03029950656215418}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.30845771144278605, "acc_stderr,none": 0.03265819588512699}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542129}, "mmlu_stem": {"acc,none": 0.27370758008246115, "acc_stderr,none": 0.007914865360243764, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.03712537833614866}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.25, "acc_stderr,none": 0.03523807393012047}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.03476590104304134}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542129}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.35294117647058826, "acc_stderr,none": 0.04755129616062947}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.18, "acc_stderr,none": 0.038612291966536934}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28936170212765955, "acc_stderr,none": 0.02964400657700962}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.25517241379310346, "acc_stderr,none": 0.03632984052707842}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.02193587808118476}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.29354838709677417, "acc_stderr,none": 0.025906087021319295}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3054187192118227, "acc_stderr,none": 0.03240661565868408}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.02659393910184406}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.304635761589404, "acc_stderr,none": 0.037579499229433426}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.38425925925925924, "acc_stderr,none": 0.03317354514310742}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.20535714285714285, "acc_stderr,none": 0.03834241021419072}, "mmlu_pro": {"exact_match,custom-extract": 0.10006648936170212, "exact_match_stderr,custom-extract": 0.0027280595360777067, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.1492329149232915, "exact_match_stderr,custom-extract": 0.013316225455158433}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11280101394169835, "exact_match_stderr,custom-extract": 0.011269480888070116}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0636042402826855, "exact_match_stderr,custom-extract": 0.0072567381353429595}, 
"mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.13170731707317074, "exact_match_stderr,custom-extract": 0.01672154370034764}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.11729857819905214, "exact_match_stderr,custom-extract": 0.011082544906439167}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.0825593395252838, "exact_match_stderr,custom-extract": 0.008845745053999918}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.13202933985330073, "exact_match_stderr,custom-extract": 0.011843408801240313}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.09186351706036745, "exact_match_stderr,custom-extract": 0.01481682998393401}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.0971843778383288, "exact_match_stderr,custom-extract": 0.008931027353227365}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07327905255366396, "exact_match_stderr,custom-extract": 0.007092470342788471}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11796536796536797, "exact_match_stderr,custom-extract": 0.010617425726799202}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.10020040080160321, "exact_match_stderr,custom-extract": 0.013455286690416086}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07775211701308699, "exact_match_stderr,custom-extract": 0.007432631448995875}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.11779448621553884, "exact_match_stderr,custom-extract": 0.011418740524805808}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.248, "acc_stderr,none": 0.019332342821239107, "acc_norm,none": 0.36, "acc_norm_stderr,none": 0.02148775108972052}, "piqa": {"alias": "piqa", "acc,none": 0.676278563656148, "acc_stderr,none": 0.01091676501070876, "acc_norm,none": 0.6838955386289445, "acc_norm_stderr,none": 0.010848148455700443}, "race": {"alias": "race", "acc,none": 0.33301435406698565, "acc_stderr,none": 0.01458610955684029}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.41760491299897645, "acc_stderr,none": 0.011159391894922484}, "winogrande": {"alias": "winogrande", "acc,none": 0.5990528808208366, "acc_stderr,none": 0.013773974554948026}} {"created_at": "2025-04-21T13:35:07.218706", "global_step": 186000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.295221843003413, "acc_stderr,none": 0.013329750293382316, "acc_norm,none": 0.34982935153583616, "acc_norm_stderr,none": 0.01393680921215828}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6359427609427609, "acc_stderr,none": 0.009873293392779123, "acc_norm,none": 0.6203703703703703, "acc_norm_stderr,none": 0.009958037725468568}, "boolq": {"alias": "boolq", "acc,none": 0.5807339449541284, "acc_stderr,none": 0.008630302070999097}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2194922194922195, "acc_stderr,none": 0.011849997754533981}, "copa": {"alias": "copa", "acc,none": 0.69, "acc_stderr,none": 0.04648231987117316}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3879705238000398, "acc_stderr,none": 0.004862919176408078, "acc_norm,none": 0.50318661621191, "acc_norm_stderr,none": 0.004989680072717475}, "mmlu": {"acc,none": 0.25081897165645917, "acc_stderr,none": 0.003655608075829684, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2544102019128587, "acc_stderr,none": 0.006348100227986018, 
"alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.03852273364924316}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.03477691162163659}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.030964517926923403}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.27848101265822783, "acc_stderr,none": 0.02917868230484256}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.3140495867768595, "acc_stderr,none": 0.04236964753041019}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094633}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.17177914110429449, "acc_stderr,none": 0.029634717272371047}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2514450867052023, "acc_stderr,none": 0.02335736578587404}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.26145251396648045, "acc_stderr,none": 0.014696599650364553}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2057877813504823, "acc_stderr,none": 0.022961339906764244}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2654320987654321, "acc_stderr,none": 0.024569223600460842}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2516297262059974, "acc_stderr,none": 0.01108327628044191}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.29239766081871343, "acc_stderr,none": 0.03488647713457922}, "mmlu_other": {"acc,none": 0.2578049565497264, "acc_stderr,none": 0.007844072512102779, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2339622641509434, "acc_stderr,none": 0.02605529690115292}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.27167630057803466, "acc_stderr,none": 0.03391750322321659}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.29596412556053814, "acc_stderr,none": 0.03063659134869981}, "mmlu_management": {"alias": " - management", "acc,none": 0.27184466019417475, "acc_stderr,none": 0.044052680241409216}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.24358974358974358, "acc_stderr,none": 0.028120966503914394}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.27330779054916987, "acc_stderr,none": 0.015936681062628567}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.19934640522875818, "acc_stderr,none": 0.02287581699346407}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.26595744680851063, "acc_stderr,none": 0.026358065698880592}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.22058823529411764, "acc_stderr,none": 0.025187786660227245}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3132530120481928, "acc_stderr,none": 0.03610805018031024}, "mmlu_social_sciences": {"acc,none": 0.24861878453038674, 
"acc_stderr,none": 0.007770686419418447, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.04303684033537315}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.1919191919191919, "acc_stderr,none": 0.028057791672989017}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20725388601036268, "acc_stderr,none": 0.029252823291803627}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.28717948717948716, "acc_stderr,none": 0.022939925418530616}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2815126050420168, "acc_stderr,none": 0.02921354941437216}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22752293577981653, "acc_stderr,none": 0.0179744635787765}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.33587786259541985, "acc_stderr,none": 0.041423137719966634}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2238562091503268, "acc_stderr,none": 0.01686300858541662}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3090909090909091, "acc_stderr,none": 0.044262946482000985}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19591836734693877, "acc_stderr,none": 0.025409301953225678}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.29850746268656714, "acc_stderr,none": 0.0323574378935504}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_stem": {"acc,none": 0.2407231208372978, "acc_stderr,none": 0.007619563349016937, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.03785714465066653}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.1513157894736842, "acc_stderr,none": 0.029162631596843985}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.04220773659171453}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.25957446808510637, "acc_stderr,none": 0.02865917937429232}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.20689655172413793, "acc_stderr,none": 0.03375672449560554}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.02193587808118475}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24838709677419354, "acc_stderr,none": 0.02458002892148101}, "mmlu_high_school_chemistry": {"alias": " - 
high_school_chemistry", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.030108330718011625}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02671924078371216}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.24503311258278146, "acc_stderr,none": 0.03511807571804726}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.02746740180405801}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.042878587513404544}, "mmlu_pro": {"exact_match,custom-extract": 0.09807180851063829, "exact_match_stderr,custom-extract": 0.002698561382077094, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.09762900976290098, "exact_match_stderr,custom-extract": 0.01109239926842664}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11406844106463879, "exact_match_stderr,custom-extract": 0.011324518110214692}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.030918727915194347, "exact_match_stderr,custom-extract": 0.005147064453055622}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.08536585365853659, "exact_match_stderr,custom-extract": 0.013816694190586969}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.13744075829383887, "exact_match_stderr,custom-extract": 0.011858737350418433}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09081527347781218, "exact_match_stderr,custom-extract": 0.009235657832562184}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.11613691931540342, "exact_match_stderr,custom-extract": 0.011208993552473897}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12073490813648294, "exact_match_stderr,custom-extract": 0.016714159620683303}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11080835603996367, "exact_match_stderr,custom-extract": 0.009464280420790103}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.05921539600296077, "exact_match_stderr,custom-extract": 0.006423852131454613}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11580086580086581, "exact_match_stderr,custom-extract": 0.010532466716077538}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13226452905811623, "exact_match_stderr,custom-extract": 0.015181011139551245}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09545804464973057, "exact_match_stderr,custom-extract": 0.008156113834567349}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.13283208020050125, "exact_match_stderr,custom-extract": 0.012021922607242956}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.228, "acc_stderr,none": 0.018781306529363204, "acc_norm,none": 0.348, "acc_norm_stderr,none": 0.0213237286328075}, "piqa": {"alias": "piqa", "acc,none": 0.6545157780195865, "acc_stderr,none": 0.011094802893617764, "acc_norm,none": 0.6708378672470077, "acc_norm_stderr,none": 0.0109637504141347}, "race": {"alias": "race", "acc,none": 0.3521531100478469, 
"acc_stderr,none": 0.014782629897202257}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.3955987717502559, "acc_stderr,none": 0.011064683986236569}, "winogrande": {"alias": "winogrande", "acc,none": 0.5619573796369376, "acc_stderr,none": 0.013944181296470803}} {"created_at": "2025-04-21T15:44:57.415760", "global_step": 188000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.295221843003413, "acc_stderr,none": 0.013329750293382316, "acc_norm,none": 0.35494880546075086, "acc_norm_stderr,none": 0.013983036904094092}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6132154882154882, "acc_stderr,none": 0.009993308355370968, "acc_norm,none": 0.5993265993265994, "acc_norm_stderr,none": 0.010055304474255573}, "boolq": {"alias": "boolq", "acc,none": 0.6654434250764526, "acc_stderr,none": 0.00825244984527319}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21621621621621623, "acc_stderr,none": 0.011785889175486643}, "copa": {"alias": "copa", "acc,none": 0.73, "acc_stderr,none": 0.044619604333847394}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3888667596096395, "acc_stderr,none": 0.004864966792310696, "acc_norm,none": 0.5060744871539534, "acc_norm_stderr,none": 0.0049894131580348}, "mmlu": {"acc,none": 0.26021934197407776, "acc_stderr,none": 0.003701249118108315, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25398512221041447, "acc_stderr,none": 0.00634760746134331, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.037184890068181146}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.03427743175816524}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591361}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.24050632911392406, "acc_stderr,none": 0.027820781981149685}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2809917355371901, "acc_stderr,none": 0.041032038305145124}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.04077494709252627}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2085889570552147, "acc_stderr,none": 0.03192193448934722}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.02289408248992599}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2282958199356913, "acc_stderr,none": 0.02383930331139821}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25, "acc_stderr,none": 0.02409347123262133}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27249022164276404, "acc_stderr,none": 0.01137165829431153}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30409356725146197, "acc_stderr,none": 0.03528211258245231}, "mmlu_other": {"acc,none": 0.25458641776633406, "acc_stderr,none": 0.0078074732977840445, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.02528839450289137}, "mmlu_college_medicine": 
{"alias": " - college_medicine", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.03295304696818317}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.24663677130044842, "acc_stderr,none": 0.028930413120910884}, "mmlu_management": {"alias": " - management", "acc,none": 0.2815533980582524, "acc_stderr,none": 0.044532548363264673}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.02934311479809447}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939098}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24776500638569604, "acc_stderr,none": 0.015438083080568965}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.024630048979824775}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.30141843971631205, "acc_stderr,none": 0.027374128882631153}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19852941176470587, "acc_stderr,none": 0.024231013370541097}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2469879518072289, "acc_stderr,none": 0.03357351982064537}, "mmlu_social_sciences": {"acc,none": 0.26259343516412087, "acc_stderr,none": 0.007931709073349157, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.0383515395439942}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2878787878787879, "acc_stderr,none": 0.03225883512300992}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.30569948186528495, "acc_stderr,none": 0.033248379397581594}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2512820512820513, "acc_stderr,none": 0.02199201666237057}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.29831932773109243, "acc_stderr,none": 0.02971914287634286}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22201834862385322, "acc_stderr,none": 0.017818849564796627}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.26717557251908397, "acc_stderr,none": 0.03880848301082396}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2565359477124183, "acc_stderr,none": 0.01766784161237899}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2636363636363636, "acc_stderr,none": 0.04220224692971987}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2897959183673469, "acc_stderr,none": 0.02904308868330433}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2736318407960199, "acc_stderr,none": 0.03152439186555402}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_stem": {"acc,none": 0.272756105296543, "acc_stderr,none": 0.007930242823249674, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2814814814814815, "acc_stderr,none": 0.03885004245800253}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2565789473684211, 
"acc_stderr,none": 0.0355418036802569}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2916666666666667, "acc_stderr,none": 0.03800968060554858}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.04533838195929775}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2425531914893617, "acc_stderr,none": 0.028020226271200217}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.26455026455026454, "acc_stderr,none": 0.022717467897708604}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2870967741935484, "acc_stderr,none": 0.025736542745594525}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.20689655172413793, "acc_stderr,none": 0.028501378167893946}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.026593939101844065}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.3443708609271523, "acc_stderr,none": 0.038796870240733264}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2175925925925926, "acc_stderr,none": 0.02813968944485968}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2767857142857143, "acc_stderr,none": 0.042466243366976235}, "mmlu_pro": {"exact_match,custom-extract": 0.10231050531914894, "exact_match_stderr,custom-extract": 0.002759551257515907, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.09902370990237098, "exact_match_stderr,custom-extract": 0.011162713195868411}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09632446134347275, "exact_match_stderr,custom-extract": 0.010510211344233732}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0627208480565371, "exact_match_stderr,custom-extract": 0.007209566250015045}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1, "exact_match_stderr,custom-extract": 0.014834045293024473}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1362559241706161, "exact_match_stderr,custom-extract": 0.011815618235249946}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10526315789473684, "exact_match_stderr,custom-extract": 0.00986388905650164}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.11124694376528117, "exact_match_stderr,custom-extract": 0.011000782283753618}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11286089238845144, 
"exact_match_stderr,custom-extract": 0.01623214090346145}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.0971843778383288, "exact_match_stderr,custom-extract": 0.008931027353227372}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.08956328645447817, "exact_match_stderr,custom-extract": 0.007771824904813092}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11580086580086581, "exact_match_stderr,custom-extract": 0.010532466716077543}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11222444889779559, "exact_match_stderr,custom-extract": 0.01414427396080389}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09853733641262509, "exact_match_stderr,custom-extract": 0.00827250303290227}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12781954887218044, "exact_match_stderr,custom-extract": 0.011826947082307299}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.234, "acc_stderr,none": 0.018952741564893683, "acc_norm,none": 0.348, "acc_norm_stderr,none": 0.0213237286328075}, "piqa": {"alias": "piqa", "acc,none": 0.6730141458106638, "acc_stderr,none": 0.010945157126978225, "acc_norm,none": 0.6806311207834603, "acc_norm_stderr,none": 0.010877964076613742}, "race": {"alias": "race", "acc,none": 0.3311004784688995, "acc_stderr,none": 0.014564986871061022}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.41299897645854655, "acc_stderr,none": 0.011141477698035238}, "winogrande": {"alias": "winogrande", "acc,none": 0.6006314127861089, "acc_stderr,none": 0.013764933546717614}} {"created_at": "2025-04-21T17:36:37.762934", "global_step": 190000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3037542662116041, "acc_stderr,none": 0.013438909184778759, "acc_norm,none": 0.36006825938566556, "acc_norm_stderr,none": 0.01402751681458519}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.622895622895623, "acc_stderr,none": 0.00994504194636652, "acc_norm,none": 0.5871212121212122, "acc_norm_stderr,none": 0.010102837421104667}, "boolq": {"alias": "boolq", "acc,none": 0.637308868501529, "acc_stderr,none": 0.008408838061823172}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20884520884520885, "acc_stderr,none": 0.011637590576063062}, "copa": {"alias": "copa", "acc,none": 0.71, "acc_stderr,none": 0.04560480215720683}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3906592312288389, "acc_stderr,none": 0.004869010152280755, "acc_norm,none": 0.5076677952599083, "acc_norm_stderr,none": 0.004989194627707848}, "mmlu": {"acc,none": 0.26570289132602193, "acc_stderr,none": 0.003725862522809479, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2624867162592986, "acc_stderr,none": 0.006413383506026254, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.0404061017820884}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.03346409881055953}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.03166009679399813}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.22784810126582278, "acc_stderr,none": 0.027303484599069405}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.371900826446281, "acc_stderr,none": 0.04412015806624503}, 
"mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2085889570552147, "acc_stderr,none": 0.03192193448934723}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.27167630057803466, "acc_stderr,none": 0.023948512905468348}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2572347266881029, "acc_stderr,none": 0.024826171289250888}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2623456790123457, "acc_stderr,none": 0.024477222856135114}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27053455019556716, "acc_stderr,none": 0.01134599674353926}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.26900584795321636, "acc_stderr,none": 0.0340105262010409}, "mmlu_other": {"acc,none": 0.2587705181847441, "acc_stderr,none": 0.007862460104473081, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2679245283018868, "acc_stderr,none": 0.027257260322494845}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.033450369167889925}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.27802690582959644, "acc_stderr,none": 0.03006958487449405}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.03916667762822585}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2606837606837607, "acc_stderr,none": 0.028760348956523414}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2656449553001277, "acc_stderr,none": 0.015794302487888722}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.025553169991826514}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.026684564340460994}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.23161764705882354, "acc_stderr,none": 0.025626533803777565}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21686746987951808, "acc_stderr,none": 0.03208284450356365}, "mmlu_social_sciences": {"acc,none": 0.2596685082872928, "acc_stderr,none": 0.007901972383204057, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2474747474747475, "acc_stderr,none": 0.0307463007421245}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.23834196891191708, "acc_stderr,none": 0.030748905363909902}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.24871794871794872, "acc_stderr,none": 0.0219169577092138}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 
0.28991596638655465, "acc_stderr,none": 0.029472485833136077}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23669724770642203, "acc_stderr,none": 0.018224078117299106}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.03768335959728744}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2957516339869281, "acc_stderr,none": 0.018463154132632806}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.040693063197213754}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.31020408163265306, "acc_stderr,none": 0.02961345987248438}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22388059701492538, "acc_stderr,none": 0.029475250236017176}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_stem": {"acc,none": 0.28322232794164287, "acc_stderr,none": 0.00801608720341244, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.28888888888888886, "acc_stderr,none": 0.0391545063041425}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.25, "acc_stderr,none": 0.03523807393012047}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2847222222222222, "acc_stderr,none": 0.03773809990686935}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.225531914893617, "acc_stderr,none": 0.027321078417387536}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.03664666337225256}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2830687830687831, "acc_stderr,none": 0.023201392938194978}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3032258064516129, "acc_stderr,none": 0.026148685930671746}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3448275862068966, "acc_stderr,none": 0.03344283744280458}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.026593939101844065}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.33112582781456956, "acc_stderr,none": 0.038425817186598696}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.03214952147802749}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.22321428571428573, 
"acc_stderr,none": 0.03952301967702511}, "mmlu_pro": {"exact_match,custom-extract": 0.10779587765957446, "exact_match_stderr,custom-extract": 0.0028203867446061205, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.1492329149232915, "exact_match_stderr,custom-extract": 0.013316225455158443}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10899873257287707, "exact_match_stderr,custom-extract": 0.011101630697795389}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07420494699646643, "exact_match_stderr,custom-extract": 0.007793679728569147}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1048780487804878, "exact_match_stderr,custom-extract": 0.015150318019731044}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.13744075829383887, "exact_match_stderr,custom-extract": 0.011858737350418433}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.0804953560371517, "exact_match_stderr,custom-extract": 0.008744292925925115}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1271393643031785, "exact_match_stderr,custom-extract": 0.011654709248697779}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11548556430446194, "exact_match_stderr,custom-extract": 0.01639549430578107}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10172570390554042, "exact_match_stderr,custom-extract": 0.009114303697059907}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.08364174685418209, "exact_match_stderr,custom-extract": 0.007534896840571092}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10606060606060606, "exact_match_stderr,custom-extract": 0.010135151380336863}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12625250501002003, "exact_match_stderr,custom-extract": 0.014883268009546972}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09853733641262509, "exact_match_stderr,custom-extract": 0.008272503032902268}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.15162907268170425, "exact_match_stderr,custom-extract": 0.012704423645915383}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.23, "acc_stderr,none": 0.018839050391123147, "acc_norm,none": 0.33, "acc_norm_stderr,none": 0.021049612166134806}, "piqa": {"alias": "piqa", "acc,none": 0.6632208922742111, "acc_stderr,none": 0.011026738925251179, "acc_norm,none": 0.6675734494015234, "acc_norm_stderr,none": 0.01099114155744559}, "race": {"alias": "race", "acc,none": 0.34545454545454546, "acc_stderr,none": 0.014716858425461329}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4155578300921187, "acc_stderr,none": 0.011151553840954863}, "winogrande": {"alias": "winogrande", "acc,none": 0.5919494869771112, "acc_stderr,none": 0.013812822643745021}} {"created_at": "2025-04-21T21:01:48.476200", "global_step": 194000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3054607508532423, "acc_stderr,none": 0.0134600804780025, "acc_norm,none": 0.34215017064846415, "acc_norm_stderr,none": 0.013864152159177282}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6258417508417509, "acc_stderr,none": 0.009929516948977625, "acc_norm,none": 0.5913299663299664, "acc_norm_stderr,none": 0.010087174498762883}, "boolq": {"alias": "boolq", "acc,none": 
0.6691131498470948, "acc_stderr,none": 0.008229663469949957}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19737919737919737, "acc_stderr,none": 0.011395305685091197}, "copa": {"alias": "copa", "acc,none": 0.72, "acc_stderr,none": 0.045126085985421276}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3877713602867955, "acc_stderr,none": 0.004862461799370396, "acc_norm,none": 0.5099581756622187, "acc_norm_stderr,none": 0.004988791687322848}, "mmlu": {"acc,none": 0.25523429710867396, "acc_stderr,none": 0.00367414089399553, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2716259298618491, "acc_stderr,none": 0.00647773071518646, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.15079365079365079, "acc_stderr,none": 0.03200686497287394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.0347769116216366}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27941176470588236, "acc_stderr,none": 0.031493281045079556}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.02904133351059804}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.371900826446281, "acc_stderr,none": 0.04412015806624504}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.04077494709252628}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3006134969325153, "acc_stderr,none": 0.03602511318806771}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.29190751445086704, "acc_stderr,none": 0.024476994076247333}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2990353697749196, "acc_stderr,none": 0.02600330111788513}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2932098765432099, "acc_stderr,none": 0.02532988817190092}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2692307692307692, "acc_stderr,none": 0.01132873440314031}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.29239766081871343, "acc_stderr,none": 0.034886477134579215}, "mmlu_other": {"acc,none": 0.24074670099774703, "acc_stderr,none": 0.007655786657848882, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2188679245283019, "acc_stderr,none": 0.02544786382510861}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.03295304696818318}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.20179372197309417, "acc_stderr,none": 0.02693611191280227}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.03916667762822585}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2606837606837607, "acc_stderr,none": 0.028760348956523414}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 
0.2720306513409962, "acc_stderr,none": 0.015913367447500517}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.0248480182638752}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2695035460992908, "acc_stderr,none": 0.02646903681859063}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.16911764705882354, "acc_stderr,none": 0.022770868010113025}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.19879518072289157, "acc_stderr,none": 0.03106939026078941}, "mmlu_social_sciences": {"acc,none": 0.23724406889827754, "acc_stderr,none": 0.007672215911654366, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748142}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25252525252525254, "acc_stderr,none": 0.030954055470365907}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22797927461139897, "acc_stderr,none": 0.03027690994517826}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2128205128205128, "acc_stderr,none": 0.020752423722128013}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471878}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22201834862385322, "acc_stderr,none": 0.017818849564796624}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2761437908496732, "acc_stderr,none": 0.018087276935663137}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.20909090909090908, "acc_stderr,none": 0.038950910157241364}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.23265306122448978, "acc_stderr,none": 0.02704925791589618}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916718}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_stem": {"acc,none": 0.2626070409134158, "acc_stderr,none": 0.007831565605649796, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04072314811876837}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3026315789473684, "acc_stderr,none": 0.03738520676119669}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653696}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, 
"acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.20425531914893616, "acc_stderr,none": 0.02635515841334943}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.03806142687309993}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776564}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25161290322580643, "acc_stderr,none": 0.024685979286239956}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2955665024630542, "acc_stderr,none": 0.032104944337514575}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.271523178807947, "acc_stderr,none": 0.03631329803969653}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.027920963147993666}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.24107142857142858, "acc_stderr,none": 0.04059867246952687}, "mmlu_pro": {"exact_match,custom-extract": 0.09025930851063829, "exact_match_stderr,custom-extract": 0.002595740460993647, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.12412831241283125, "exact_match_stderr,custom-extract": 0.012322509407061307}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09759188846641319, "exact_match_stderr,custom-extract": 0.010571710152486615}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.03180212014134275, "exact_match_stderr,custom-extract": 0.0052176963130371986}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.09024390243902439, "exact_match_stderr,custom-extract": 0.014168039768581511}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1386255924170616, "exact_match_stderr,custom-extract": 0.011901560328343764}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.05985552115583075, "exact_match_stderr,custom-extract": 0.007624506334310255}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.11613691931540342, "exact_match_stderr,custom-extract": 0.011208993552473897}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11548556430446194, "exact_match_stderr,custom-extract": 0.016395494305781064}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09990917347865577, "exact_match_stderr,custom-extract": 0.009041687821794691}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06439674315321983, "exact_match_stderr,custom-extract": 0.006680530175483019}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1277056277056277, "exact_match_stderr,custom-extract": 0.01098590155110224}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12224448897795591, "exact_match_stderr,custom-extract": 0.014678671649386715}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 
0.050808314087759814, "exact_match_stderr,custom-extract": 0.0060954673419336405}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.11403508771929824, "exact_match_stderr,custom-extract": 0.011258961939273109}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.206, "acc_stderr,none": 0.018104794037333553, "acc_norm,none": 0.318, "acc_norm_stderr,none": 0.020847571620814014}, "piqa": {"alias": "piqa", "acc,none": 0.6572361262241567, "acc_stderr,none": 0.011073978007039314, "acc_norm,none": 0.6719260065288357, "acc_norm_stderr,none": 0.010954487135124213}, "race": {"alias": "race", "acc,none": 0.3550239234449761, "acc_stderr,none": 0.014809839887617086}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.42221084953940635, "acc_stderr,none": 0.011176305491513962}, "winogrande": {"alias": "winogrande", "acc,none": 0.5698500394632992, "acc_stderr,none": 0.013914685094716701}}
{"created_at": "2025-04-21T23:03:57.368867", "global_step": 196000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.32337883959044367, "acc_stderr,none": 0.013669421630012137, "acc_norm,none": 0.3626279863481229, "acc_norm_stderr,none": 0.014049106564955009}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6447811447811448, "acc_stderr,none": 0.009820245899287124, "acc_norm,none": 0.6216329966329966, "acc_norm_stderr,none": 0.009951575683331944}, "boolq": {"alias": "boolq", "acc,none": 0.5415902140672783, "acc_stderr,none": 0.008714749017709892}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.011704202814200261}, "copa": {"alias": "copa", "acc,none": 0.72, "acc_stderr,none": 0.045126085985421276}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3890659231228839, "acc_stderr,none": 0.004865419468213882, "acc_norm,none": 0.5063732324238199, "acc_norm_stderr,none": 0.004989376044184163}, "mmlu": {"acc,none": 0.2593647628542943, "acc_stderr,none": 0.0036972539118479172, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24930924548352817, "acc_stderr,none": 0.006308881501166513, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.038522733649243156}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23030303030303031, "acc_stderr,none": 0.0328766675860349}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.029771775228145617}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.22784810126582278, "acc_stderr,none": 0.027303484599069425}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.17355371900826447, "acc_stderr,none": 0.03457272836917671}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04330043749650742}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2883435582822086, "acc_stderr,none": 0.035590395316173425}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.02289408248992599}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24916201117318434, "acc_stderr,none": 0.014465893829859926}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2829581993569132, "acc_stderr,none": 0.025583062489984827}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24074074074074073, "acc_stderr,none":
0.02378858355165854}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25749674054758803, "acc_stderr,none": 0.01116770601490416}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.031885780176863984}, "mmlu_other": {"acc,none": 0.26778242677824265, "acc_stderr,none": 0.007943205115738913, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.27547169811320754, "acc_stderr,none": 0.027495663683724057}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2832369942196532, "acc_stderr,none": 0.03435568056047874}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816508}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.19730941704035873, "acc_stderr,none": 0.02670985334496796}, "mmlu_management": {"alias": " - management", "acc,none": 0.2815533980582524, "acc_stderr,none": 0.04453254836326469}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.02934311479809446}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.26947637292464877, "acc_stderr,none": 0.01586624307321505}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.025646863097137908}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25177304964539005, "acc_stderr,none": 0.025892151156709405}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.027678468642144703}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.23493975903614459, "acc_stderr,none": 0.03300533186128922}, "mmlu_social_sciences": {"acc,none": 0.2593435164120897, "acc_stderr,none": 0.00789060883653011, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.20175438596491227, "acc_stderr,none": 0.037752050135836386}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.23232323232323232, "acc_stderr,none": 0.030088629490217487}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.3471502590673575, "acc_stderr,none": 0.03435696168361355}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.23076923076923078, "acc_stderr,none": 0.02136202772522273}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2605042016806723, "acc_stderr,none": 0.028510251512341926}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26238532110091745, "acc_stderr,none": 0.018861885021534738}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.21374045801526717, "acc_stderr,none": 0.035954616117746904}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.01755581809132226}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04265792110940588}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3020408163265306, "acc_stderr,none": 
0.029393609319879815}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.3034825870646766, "acc_stderr,none": 0.03251006816458616}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_stem": {"acc,none": 0.2660957817951158, "acc_stderr,none": 0.007870017848973341, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2814814814814815, "acc_stderr,none": 0.03885004245800254}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.03690677986137283}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617747}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542126}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.0285048564705142}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.03664666337225256}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23544973544973544, "acc_stderr,none": 0.021851509822031715}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3, "acc_stderr,none": 0.02606936229533514}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.03144712581678242}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2740740740740741, "acc_stderr,none": 0.027195934804085626}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2119205298013245, "acc_stderr,none": 0.03336767086567977}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3194444444444444, "acc_stderr,none": 0.0317987634217685}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.0929188829787234, "exact_match_stderr,custom-extract": 0.002629905694758259, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.13110181311018132, "exact_match_stderr,custom-extract": 0.012613403336459909}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09885931558935361, "exact_match_stderr,custom-extract": 0.010632661544075477}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.036219081272084806, "exact_match_stderr,custom-extract": 0.005555543779666385}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 
0.08780487804878048, "exact_match_stderr,custom-extract": 0.013993989404782777}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1504739336492891, "exact_match_stderr,custom-extract": 0.012314171688222528}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.06914344685242518, "exact_match_stderr,custom-extract": 0.008154159728499578}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12224938875305623, "exact_match_stderr,custom-extract": 0.011460350236485606}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12598425196850394, "exact_match_stderr,custom-extract": 0.017022602638569542}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.06721162579473206, "exact_match_stderr,custom-extract": 0.007549486626300625}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.05551443375277572, "exact_match_stderr,custom-extract": 0.006232090274026651}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11580086580086581, "exact_match_stderr,custom-extract": 0.010532466716077542}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12224448897795591, "exact_match_stderr,custom-extract": 0.014678671649386722}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08160123171670516, "exact_match_stderr,custom-extract": 0.007598478818397754}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.13032581453634084, "exact_match_stderr,custom-extract": 0.011925163793228671}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.248, "acc_stderr,none": 0.019332342821239103, "acc_norm,none": 0.356, "acc_norm_stderr,none": 0.02143471235607264}, "piqa": {"alias": "piqa", "acc,none": 0.6697497279651795, "acc_stderr,none": 0.010972947133006299, "acc_norm,none": 0.6626768226332971, "acc_norm_stderr,none": 0.011031114785059694}, "race": {"alias": "race", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.01488799043759141}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.40276356192425794, "acc_stderr,none": 0.01109806114337135}, "winogrande": {"alias": "winogrande", "acc,none": 0.5872138910812944, "acc_stderr,none": 0.01383706064868208}}
{"created_at": "2025-04-22T00:29:17.259067", "global_step": 198000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.318259385665529, "acc_stderr,none": 0.013611993916971451, "acc_norm,none": 0.3677474402730375, "acc_norm_stderr,none": 0.014090995618168475}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.63510101010101, "acc_stderr,none": 0.009878157021155652, "acc_norm,none": 0.6039562289562289, "acc_norm_stderr,none": 0.010035580962097949}, "boolq": {"alias": "boolq", "acc,none": 0.6281345565749236, "acc_stderr,none": 0.00845301800735403}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20966420966420968, "acc_stderr,none": 0.01165435009370464}, "copa": {"alias": "copa", "acc,none": 0.71, "acc_stderr,none": 0.04560480215720684}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3966341366261701, "acc_stderr,none": 0.0048819904876289165, "acc_norm,none": 0.5126468830910177, "acc_norm_stderr,none": 0.00498818498834529}, "mmlu": {"acc,none": 0.24262925509186725, "acc_stderr,none": 0.003613693744937636, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25377258235919237, "acc_stderr,none": 0.0063360321466259546, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none":
0.2619047619047619, "acc_stderr,none": 0.03932537680392869}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.031922715695482995}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23039215686274508, "acc_stderr,none": 0.029554292605695053}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2616033755274262, "acc_stderr,none": 0.028609516716994934}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.34710743801652894, "acc_stderr,none": 0.04345724570292535}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04330043749650742}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.02344582627654555}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480764}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.23765432098765432, "acc_stderr,none": 0.023683591837008557}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27249022164276404, "acc_stderr,none": 0.011371658294311523}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.2513678789829417, "acc_stderr,none": 0.007777182358038267, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.04605661864718381}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23773584905660378, "acc_stderr,none": 0.026199808807561925}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.21965317919075145, "acc_stderr,none": 0.031568093627031744}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165065}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.25112107623318386, "acc_stderr,none": 0.029105220833224605}, "mmlu_management": {"alias": " - management", "acc,none": 0.1553398058252427, "acc_stderr,none": 0.03586594738573975}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2863247863247863, "acc_stderr,none": 0.02961432369045666}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.26309067688378035, "acc_stderr,none": 0.015745497169049057}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.025553169991826538}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2695035460992908, "acc_stderr,none": 0.026469036818590624}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1948529411764706, "acc_stderr,none": 0.02406059942348742}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.23139421514462138, "acc_stderr,none": 0.007600832924306099, "alias": " - social sciences"}, "mmlu_econometrics": 
{"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518753}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.02985751567338641}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20207253886010362, "acc_stderr,none": 0.02897908979429673}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2282051282051282, "acc_stderr,none": 0.021278393863586282}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.026265024608275886}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.20550458715596331, "acc_stderr,none": 0.017324352325016012}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2824427480916031, "acc_stderr,none": 0.03948406125768361}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.017952449196987866}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04265792110940589}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2, "acc_stderr,none": 0.025607375986579153}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.030147775935409217}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_stem": {"acc,none": 0.22835394862036157, "acc_stderr,none": 0.007473485383253702, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.03820169914517905}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.033176727875331574}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.03586879280080341}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.19607843137254902, "acc_stderr,none": 0.03950581861179963}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2723404255319149, "acc_stderr,none": 0.0291012906983867}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.03600105692727772}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.20899470899470898, "acc_stderr,none": 0.020940481565334852}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2, "acc_stderr,none": 0.022755204959542932}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2019704433497537, "acc_stderr,none": 0.028247350122180267}, 
"mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.026202766534652155}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2052980132450331, "acc_stderr,none": 0.03297986648473836}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.17592592592592593, "acc_stderr,none": 0.02596742095825853}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "mmlu_pro": {"exact_match,custom-extract": 0.09765625, "exact_match_stderr,custom-extract": 0.002695370269211375, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.13249651324965134, "exact_match_stderr,custom-extract": 0.012670137504949325}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.08998732572877059, "exact_match_stderr,custom-extract": 0.010194156217460268}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06890459363957598, "exact_match_stderr,custom-extract": 0.007531645622174551}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1024390243902439, "exact_match_stderr,custom-extract": 0.014993500684238468}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.12914691943127962, "exact_match_stderr,custom-extract": 0.01155049734375654}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.0804953560371517, "exact_match_stderr,custom-extract": 0.008744292925925086}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1295843520782396, "exact_match_stderr,custom-extract": 0.011749749223850747}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10498687664041995, "exact_match_stderr,custom-extract": 0.015724991203554552}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08174386920980926, "exact_match_stderr,custom-extract": 0.00826063001429721}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.050333086602516654, "exact_match_stderr,custom-extract": 0.005950391645720973}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11255411255411256, "exact_match_stderr,custom-extract": 0.010402812578120455}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13026052104208416, "exact_match_stderr,custom-extract": 0.015082951205521087}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08775981524249422, "exact_match_stderr,custom-extract": 0.007853533553054279}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.14411027568922305, "exact_match_stderr,custom-extract": 0.012440195916530138}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.236, "acc_stderr,none": 0.019008699622084724, "acc_norm,none": 0.338, "acc_norm_stderr,none": 0.02117566569520941}, "piqa": {"alias": "piqa", "acc,none": 0.6697497279651795, "acc_stderr,none": 0.010972947133006294, "acc_norm,none": 0.6686615886833515, "acc_norm_stderr,none": 0.01098207745895735}, "race": {"alias": "race", "acc,none": 0.3464114832535885, "acc_stderr,none": 0.014726451021782803}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4181166837256909, 
"acc_stderr,none": 0.011161320510270642}, "winogrande": {"alias": "winogrande", "acc,none": 0.585635359116022, "acc_stderr,none": 0.013844846232268563}} {"created_at": "2025-04-22T02:43:56.623873", "global_step": 200000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3395904436860068, "acc_stderr,none": 0.013839039762820167, "acc_norm,none": 0.3728668941979522, "acc_norm_stderr,none": 0.014131176760131158}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6468855218855218, "acc_stderr,none": 0.009807078935467617, "acc_norm,none": 0.622895622895623, "acc_norm_stderr,none": 0.009945041946366513}, "boolq": {"alias": "boolq", "acc,none": 0.6388379204892967, "acc_stderr,none": 0.008401154195242375}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19737919737919737, "acc_stderr,none": 0.011395305685091195}, "copa": {"alias": "copa", "acc,none": 0.7, "acc_stderr,none": 0.046056618647183814}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4001194981079466, "acc_stderr,none": 0.004889210628907955, "acc_norm,none": 0.5193188607847042, "acc_norm_stderr,none": 0.0049860554640443954}, "mmlu": {"acc,none": 0.2545933627688364, "acc_stderr,none": 0.003673847433408553, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2629117959617428, "acc_stderr,none": 0.006414601466146285, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.036196045241242494}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.033464098810559534}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.02933116229425173}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25738396624472576, "acc_stderr,none": 0.028458820991460302}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.33884297520661155, "acc_stderr,none": 0.0432076780753667}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.03893542518824847}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3067484662576687, "acc_stderr,none": 0.036230899157241474}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.28901734104046245, "acc_stderr,none": 0.024405173935783234}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.28938906752411575, "acc_stderr,none": 0.025755865922632945}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2993827160493827, "acc_stderr,none": 0.02548311560119547}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25684485006518903, "acc_stderr,none": 0.011158455853098843}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.28654970760233917, "acc_stderr,none": 0.034678266857038266}, "mmlu_other": {"acc,none": 0.24267782426778242, "acc_stderr,none": 0.007687406272717251, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2188679245283019, "acc_stderr,none": 0.02544786382510861}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24855491329479767, "acc_stderr,none": 
0.03295304696818318}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.21076233183856502, "acc_stderr,none": 0.027373095500540186}, "mmlu_management": {"alias": " - management", "acc,none": 0.18446601941747573, "acc_stderr,none": 0.03840423627288276}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.23931623931623933, "acc_stderr,none": 0.027951826808924333}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2669220945083014, "acc_stderr,none": 0.015818450894777562}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.26143790849673204, "acc_stderr,none": 0.025160998214292456}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.26595744680851063, "acc_stderr,none": 0.026358065698880592}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.21323529411764705, "acc_stderr,none": 0.02488097151229428}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.20481927710843373, "acc_stderr,none": 0.031417842916639245}, "mmlu_social_sciences": {"acc,none": 0.24179395515112123, "acc_stderr,none": 0.007716248349354236, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748141}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.26262626262626265, "acc_stderr,none": 0.03135305009533086}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21243523316062177, "acc_stderr,none": 0.029519282616817258}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20512820512820512, "acc_stderr,none": 0.020473233173551986}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23949579831932774, "acc_stderr,none": 0.027722065493361266}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.21651376146788992, "acc_stderr,none": 0.017658710594443135}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596919}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.018054027458815198}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.20909090909090908, "acc_stderr,none": 0.038950910157241364}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2938775510204082, "acc_stderr,none": 0.029162738410249772}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916714}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_stem": {"acc,none": 0.2664129400570885, "acc_stderr,none": 0.007873383208734878, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04072314811876837}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.0378272898086547},
"mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.03586879280080341}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.20425531914893616, "acc_stderr,none": 0.02635515841334943}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.03806142687309993}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776564}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24193548387096775, "acc_stderr,none": 0.0243625996930311}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3054187192118227, "acc_stderr,none": 0.03240661565868408}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.271523178807947, "acc_stderr,none": 0.03631329803969653}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02988691054762696}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.24107142857142858, "acc_stderr,none": 0.040598672469526864}, "mmlu_pro": {"exact_match,custom-extract": 0.09807180851063829, "exact_match_stderr,custom-extract": 0.00269463984025853, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.12133891213389121, "exact_match_stderr,custom-extract": 0.012202652228883244}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.13054499366286437, "exact_match_stderr,custom-extract": 0.01200163896868047}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.053003533568904596, "exact_match_stderr,custom-extract": 0.006661856730672948}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.08536585365853659, "exact_match_stderr,custom-extract": 0.013816694190586969}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.12085308056872038, "exact_match_stderr,custom-extract": 0.011226536807498473}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07223942208462332, "exact_match_stderr,custom-extract": 0.008320844580110063}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.14425427872860636, "exact_match_stderr,custom-extract": 0.012292088876836887}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11286089238845144, "exact_match_stderr,custom-extract": 0.01623214090346143}, "mmlu_pro_law":
{"alias": " - law", "exact_match,custom-extract": 0.07720254314259764, "exact_match_stderr,custom-extract": 0.008047716247195571}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.05921539600296077, "exact_match_stderr,custom-extract": 0.006423852131454607}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.09956709956709957, "exact_match_stderr,custom-extract": 0.009855592879057058}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13026052104208416, "exact_match_stderr,custom-extract": 0.015082951205521087}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07929176289453425, "exact_match_stderr,custom-extract": 0.007499593169839573}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.17167919799498746, "exact_match_stderr,custom-extract": 0.013357616212456418}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.234, "acc_stderr,none": 0.01895274156489368, "acc_norm,none": 0.346, "acc_norm_stderr,none": 0.021294951277234637}, "piqa": {"alias": "piqa", "acc,none": 0.6697497279651795, "acc_stderr,none": 0.010972947133006294, "acc_norm,none": 0.6806311207834603, "acc_norm_stderr,none": 0.010877964076613747}, "race": {"alias": "race", "acc,none": 0.3684210526315789, "acc_stderr,none": 0.014929174445557296}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4196519959058342, "acc_stderr,none": 0.011167032303390539}, "winogrande": {"alias": "winogrande", "acc,none": 0.5990528808208366, "acc_stderr,none": 0.013773974554948028}} {"created_at": "2025-04-22T04:32:43.395943", "global_step": 202000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3242320819112628, "acc_stderr,none": 0.013678810399518815, "acc_norm,none": 0.3626279863481229, "acc_norm_stderr,none": 0.014049106564955007}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6435185185185185, "acc_stderr,none": 0.009828046544504419, "acc_norm,none": 0.6178451178451179, "acc_norm_stderr,none": 0.009970747281292434}, "boolq": {"alias": "boolq", "acc,none": 0.6519877675840978, "acc_stderr,none": 0.00833123755953539}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.011704202814200274}, "copa": {"alias": "copa", "acc,none": 0.7, "acc_stderr,none": 0.046056618647183814}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3933479386576379, "acc_stderr,none": 0.004874945833947071, "acc_norm,none": 0.5115514837681737, "acc_norm_stderr,none": 0.00498844959300725}, "mmlu": {"acc,none": 0.2595784076342401, "acc_stderr,none": 0.00369281645302572, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.271413390010627, "acc_stderr,none": 0.006477739576261489, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.18253968253968253, "acc_stderr,none": 0.03455071019102148}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.03453131801885415}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3088235294117647, "acc_stderr,none": 0.03242661719827218}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2911392405063291, "acc_stderr,none": 0.02957160106575337}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.3884297520661157, "acc_stderr,none": 0.04449270350068382}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, 
"acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25766871165644173, "acc_stderr,none": 0.03436150827846917}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2861271676300578, "acc_stderr,none": 0.02433214677913413}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2435754189944134, "acc_stderr,none": 0.014355911964767864}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2958199356913183, "acc_stderr,none": 0.025922371788818777}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02438366553103545}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2737940026075619, "acc_stderr,none": 0.011388612167979392}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.27485380116959063, "acc_stderr,none": 0.03424042924691583}, "mmlu_other": {"acc,none": 0.24750563244287094, "acc_stderr,none": 0.007715902489985684, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22641509433962265, "acc_stderr,none": 0.025757559893106737}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.033450369167889925}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.21524663677130046, "acc_stderr,none": 0.02758406660220827}, "mmlu_management": {"alias": " - management", "acc,none": 0.1650485436893204, "acc_stderr,none": 0.036756688322331886}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.25213675213675213, "acc_stderr,none": 0.02844796547623102}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.29246487867177523, "acc_stderr,none": 0.016267000684598652}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.02495418432487991}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2730496453900709, "acc_stderr,none": 0.026577860943307854}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.15073529411764705, "acc_stderr,none": 0.021734235515652844}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21084337349397592, "acc_stderr,none": 0.031755547866299194}, "mmlu_social_sciences": {"acc,none": 0.24601884952876177, "acc_stderr,none": 0.0077586117983253575, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.04142439719489361}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25757575757575757, "acc_stderr,none": 0.03115626951964684}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2538860103626943, "acc_stderr,none": 0.03141024780565319}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.22564102564102564, "acc_stderr,none": 0.02119363252514854}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.026265024608275882}, 
"mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22568807339449543, "acc_stderr,none": 0.01792308766780306}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.272875816993464, "acc_stderr,none": 0.01802047414839358}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.20909090909090908, "acc_stderr,none": 0.03895091015724137}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3224489795918367, "acc_stderr,none": 0.029923100563683903}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.21890547263681592, "acc_stderr,none": 0.029239174636647}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_stem": {"acc,none": 0.26704725658103395, "acc_stderr,none": 0.007867401336338222, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.32592592592592595, "acc_stderr,none": 0.040491220417025055}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3355263157894737, "acc_stderr,none": 0.038424985593952694}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653696}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.04220773659171452}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2, "acc_stderr,none": 0.0261488180184245}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.31724137931034485, "acc_stderr,none": 0.03878352372138621}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2830687830687831, "acc_stderr,none": 0.023201392938194978}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2645161290322581, "acc_stderr,none": 0.02509189237885928}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.31527093596059114, "acc_stderr,none": 0.03269080871970186}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621505}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712163}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2847682119205298, "acc_stderr,none": 0.03684881521389023}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2175925925925926, "acc_stderr,none": 0.028139689444859672}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755806}, "mmlu_pro": {"exact_match,custom-extract": 
0.10588430851063829, "exact_match_stderr,custom-extract": 0.002797708654459191, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.14644351464435146, "exact_match_stderr,custom-extract": 0.013212794906829773}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10646387832699619, "exact_match_stderr,custom-extract": 0.010987378600044827}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0706713780918728, "exact_match_stderr,custom-extract": 0.007620353777747184}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.08048780487804878, "exact_match_stderr,custom-extract": 0.013451853667809174}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1362559241706161, "exact_match_stderr,custom-extract": 0.011815618235249946}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09803921568627451, "exact_match_stderr,custom-extract": 0.009557758729735875}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1332518337408313, "exact_match_stderr,custom-extract": 0.011889731153818453}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.13385826771653545, "exact_match_stderr,custom-extract": 0.017467280079326585}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11080835603996367, "exact_match_stderr,custom-extract": 0.009464280420790077}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06439674315321983, "exact_match_stderr,custom-extract": 0.006680530175483018}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10497835497835498, "exact_match_stderr,custom-extract": 0.010089410685404705}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11823647294589178, "exact_match_stderr,custom-extract": 0.014468953704661763}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10623556581986143, "exact_match_stderr,custom-extract": 0.008552816527360388}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12406015037593984, "exact_match_stderr,custom-extract": 0.011676807835852744}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.23, "acc_stderr,none": 0.01883905039112313, "acc_norm,none": 0.352, "acc_norm_stderr,none": 0.02138004238594604}, "piqa": {"alias": "piqa", "acc,none": 0.6588683351468988, "acc_stderr,none": 0.011061289443962705, "acc_norm,none": 0.6735582154515778, "acc_norm_stderr,none": 0.0109404670461773}, "race": {"alias": "race", "acc,none": 0.35119617224880384, "acc_stderr,none": 0.014773430019036972}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.40583418628454454, "acc_stderr,none": 0.011111610832965838}, "winogrande": {"alias": "winogrande", "acc,none": 0.5730071033938438, "acc_stderr,none": 0.013901878072575057}}
{"created_at": "2025-04-22T06:13:40.974233", "global_step": 204000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.32081911262798635, "acc_stderr,none": 0.01364094309194653, "acc_norm,none": 0.3583617747440273, "acc_norm_stderr,none": 0.014012883334859868}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6203703703703703, "acc_stderr,none": 0.009958037725468572, "acc_norm,none": 0.6056397306397306, "acc_norm_stderr,none": 0.010028176038393006}, "boolq": {"alias": "boolq", "acc,none": 0.4831804281345566, "acc_stderr,none": 0.00874010565876395}, "commonsense_qa":
{"alias": "commonsense_qa", "acc,none": 0.20393120393120392, "acc_stderr,none": 0.011535521334313653}, "copa": {"alias": "copa", "acc,none": 0.68, "acc_stderr,none": 0.046882617226215034}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3920533758215495, "acc_stderr,none": 0.004872107262082463, "acc_norm,none": 0.5103565026887075, "acc_norm_stderr,none": 0.00498871091716933}, "mmlu": {"acc,none": 0.25751317476143, "acc_stderr,none": 0.0036860223728212653, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.26971307120085014, "acc_stderr,none": 0.006466955353612423, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.038932596106046734}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.28484848484848485, "acc_stderr,none": 0.03524390844511782}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.30392156862745096, "acc_stderr,none": 0.032282103870378935}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2616033755274262, "acc_stderr,none": 0.028609516716994934}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.3884297520661157, "acc_stderr,none": 0.04449270350068382}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.041331194402438376}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26993865030674846, "acc_stderr,none": 0.034878251684978906}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2658959537572254, "acc_stderr,none": 0.023786203255508297}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2990353697749196, "acc_stderr,none": 0.02600330111788513}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.27469135802469136, "acc_stderr,none": 0.024836057868294677}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27444589308996087, "acc_stderr,none": 0.011397043163078154}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23976608187134502, "acc_stderr,none": 0.03274485211946956}, "mmlu_other": {"acc,none": 0.2536208561313164, "acc_stderr,none": 0.007790435880892492, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23018867924528302, "acc_stderr,none": 0.02590789712240817}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2254335260115607, "acc_stderr,none": 0.03186209851641144}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.35874439461883406, "acc_stderr,none": 0.03219079200419994}, "mmlu_management": {"alias": " - management", "acc,none": 0.27184466019417475, "acc_stderr,none": 0.044052680241409216}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.24358974358974358, "acc_stderr,none": 0.028120966503914404}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.25798212005108556, "acc_stderr,none": 0.01564583018834895}, 
"mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24836601307189543, "acc_stderr,none": 0.024739981355113596}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.26595744680851063, "acc_stderr,none": 0.02635806569888059}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19117647058823528, "acc_stderr,none": 0.023886881922440345}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21084337349397592, "acc_stderr,none": 0.031755547866299194}, "mmlu_social_sciences": {"acc,none": 0.2398440038999025, "acc_stderr,none": 0.007700385106649716, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.04266339443159394}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.029620227874790482}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.23316062176165803, "acc_stderr,none": 0.03051611137147601}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2153846153846154, "acc_stderr,none": 0.020843034557462878}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.25210084033613445, "acc_stderr,none": 0.028205545033277726}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.21651376146788992, "acc_stderr,none": 0.01765871059444314}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.21374045801526717, "acc_stderr,none": 0.0359546161177469}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2679738562091503, "acc_stderr,none": 0.017917974069594726}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2636363636363636, "acc_stderr,none": 0.04220224692971987}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2653061224489796, "acc_stderr,none": 0.028263889943784606}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22885572139303484, "acc_stderr,none": 0.02970528405677245}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_stem": {"acc,none": 0.2603869330796067, "acc_stderr,none": 0.00780681295999734, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.03944624162501117}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3223684210526316, "acc_stderr,none": 0.03803510248351585}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.15, "acc_stderr,none": 0.03588702812826371}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165044}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.043364327079931785}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.37, "acc_stderr,none": 0.048523658709391}, "mmlu_conceptual_physics": 
{"alias": " - conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342343}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.036001056927277716}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.021935878081184756}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25161290322580643, "acc_stderr,none": 0.024685979286239966}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.28078817733990147, "acc_stderr,none": 0.0316185633535861}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2251655629139073, "acc_stderr,none": 0.03410435282008936}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.028765111718046944}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.24107142857142858, "acc_stderr,none": 0.04059867246952687}, "mmlu_pro": {"exact_match,custom-extract": 0.0945811170212766, "exact_match_stderr,custom-extract": 0.002658628538652858, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.100418410041841, "exact_match_stderr,custom-extract": 0.011232345114394712}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11787072243346007, "exact_match_stderr,custom-extract": 0.011486983100199015}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.05123674911660778, "exact_match_stderr,custom-extract": 0.0065559918973592535}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.14390243902439023, "exact_match_stderr,custom-extract": 0.01735537705248126}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.13033175355450238, "exact_match_stderr,custom-extract": 0.01159546417895376}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.06501547987616099, "exact_match_stderr,custom-extract": 0.00792451912485295}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.11613691931540342, "exact_match_stderr,custom-extract": 0.011208993552473897}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.09448818897637795, "exact_match_stderr,custom-extract": 0.0150052772401423}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07538601271571299, "exact_match_stderr,custom-extract": 0.007960297036631305}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.08068097705403404, "exact_match_stderr,custom-extract": 0.007412279950922269}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11471861471861472, "exact_match_stderr,custom-extract": 0.010489547712821748}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11422845691382766, "exact_match_stderr,custom-extract": 0.014253888115016489}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07544264819091609, "exact_match_stderr,custom-extract": 0.007330575047540659}, 
"mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12406015037593984, "exact_match_stderr,custom-extract": 0.011676807835852744}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.214, "acc_stderr,none": 0.01835979750238701, "acc_norm,none": 0.35, "acc_norm_stderr,none": 0.021352091786223108}, "piqa": {"alias": "piqa", "acc,none": 0.6719260065288357, "acc_stderr,none": 0.010954487135124215, "acc_norm,none": 0.6811751904243744, "acc_norm_stderr,none": 0.010873037534333418}, "race": {"alias": "race", "acc,none": 0.33779904306220093, "acc_stderr,none": 0.014637734314782857}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.42067553735926305, "acc_stderr,none": 0.01117077851770561}, "winogrande": {"alias": "winogrande", "acc,none": 0.590370955011839, "acc_stderr,none": 0.013821049109655462}} {"created_at": "2025-04-22T11:49:29.847775", "global_step": 210000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.310580204778157, "acc_stderr,none": 0.013522292098053052, "acc_norm,none": 0.34812286689419797, "acc_norm_stderr,none": 0.013921008595179338}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6283670033670034, "acc_stderr,none": 0.009915897123658797, "acc_norm,none": 0.5879629629629629, "acc_norm_stderr,none": 0.010099765857562764}, "boolq": {"alias": "boolq", "acc,none": 0.6391437308868502, "acc_stderr,none": 0.008399606360769108}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20065520065520065, "acc_stderr,none": 0.011466011466011545}, "copa": {"alias": "copa", "acc,none": 0.67, "acc_stderr,none": 0.04725815626252607}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3911571400119498, "acc_stderr,none": 0.004870121051762735, "acc_norm,none": 0.510655247958574, "acc_norm_stderr,none": 0.0049886482600100335}, "mmlu": {"acc,none": 0.24291411479846176, "acc_stderr,none": 0.003615545836283379, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2454835281615303, "acc_stderr,none": 0.006268397627820697, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04216370213557835}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.03453131801885416}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28921568627450983, "acc_stderr,none": 0.03182231867647553}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2616033755274262, "acc_stderr,none": 0.028609516716994934}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.23140495867768596, "acc_stderr,none": 0.038498560987940904}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.04414343666854933}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.03226219377286774}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.02344582627654555}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.22681564245810057, "acc_stderr,none": 0.014005843570897897}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.18971061093247588, "acc_stderr,none": 0.022268196258783228}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.20679012345679013, "acc_stderr,none": 0.022535006705942818}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 
0.2529335071707953, "acc_stderr,none": 0.011102268713839987}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.03446296217088426}, "mmlu_other": {"acc,none": 0.25329900225297713, "acc_stderr,none": 0.007780959560728028, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23018867924528302, "acc_stderr,none": 0.02590789712240817}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173043}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3542600896860987, "acc_stderr,none": 0.032100621541349864}, "mmlu_management": {"alias": " - management", "acc,none": 0.20388349514563106, "acc_stderr,none": 0.03989139859531771}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674057}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2567049808429119, "acc_stderr,none": 0.015620480263064512}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23202614379084968, "acc_stderr,none": 0.02417084087934101}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.026684564340461004}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.18382352941176472, "acc_stderr,none": 0.023529242185193106}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.26506024096385544, "acc_stderr,none": 0.03436024037944966}, "mmlu_social_sciences": {"acc,none": 0.23366915827104323, "acc_stderr,none": 0.0076283231249699626, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.30701754385964913, "acc_stderr,none": 0.04339138322579861}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.02962022787479048}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22797927461139897, "acc_stderr,none": 0.030276909945178256}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2128205128205128, "acc_stderr,none": 0.020752423722128006}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.026265024608275886}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22568807339449543, "acc_stderr,none": 0.017923087667803053}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.1984732824427481, "acc_stderr,none": 0.03498149385462475}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25326797385620914, "acc_stderr,none": 0.017593486895366835}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3, "acc_stderr,none": 0.04389311454644286}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20816326530612245, "acc_stderr,none": 0.025991117672813296}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, 
"acc_stderr,none": 0.030567675938916707}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_stem": {"acc,none": 0.23786869647954328, "acc_stderr,none": 0.007587282643014663, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2074074074074074, "acc_stderr,none": 0.03502553170678317}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2236842105263158, "acc_stderr,none": 0.03391160934343602}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.04440521906179325}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28085106382978725, "acc_stderr,none": 0.029379170464124818}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.0360010569272777}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24867724867724866, "acc_stderr,none": 0.022261817692400182}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2, "acc_stderr,none": 0.022755204959542936}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2019704433497537, "acc_stderr,none": 0.02824735012218027}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.02620276653465215}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2582781456953642, "acc_stderr,none": 0.035737053147634576}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.19444444444444445, "acc_stderr,none": 0.026991454502036726}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.038946411200447915}, "mmlu_pro": {"exact_match,custom-extract": 0.1007313829787234, "exact_match_stderr,custom-extract": 0.0027380852934266206, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.09344490934449093, "exact_match_stderr,custom-extract": 0.010877232530238947}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10646387832699619, "exact_match_stderr,custom-extract": 0.010987378600044827}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07950530035335689, "exact_match_stderr,custom-extract": 0.008044098592471984}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.08048780487804878, "exact_match_stderr,custom-extract": 0.013451853667809174}, 
"mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.12559241706161137, "exact_match_stderr,custom-extract": 0.011413658642349252}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09081527347781218, "exact_match_stderr,custom-extract": 0.009235657832562182}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12102689486552567, "exact_match_stderr,custom-extract": 0.011410842488489002}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11548556430446194, "exact_match_stderr,custom-extract": 0.016395494305781105}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10172570390554042, "exact_match_stderr,custom-extract": 0.009114303697059909}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.05921539600296077, "exact_match_stderr,custom-extract": 0.0064238521314546005}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11363636363636363, "exact_match_stderr,custom-extract": 0.010446330904020987}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12224448897795591, "exact_match_stderr,custom-extract": 0.01467867164938673}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10007698229407236, "exact_match_stderr,custom-extract": 0.00832975896214758}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.14160401002506265, "exact_match_stderr,custom-extract": 0.012349587610134355}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.182, "acc_stderr,none": 0.01727277329773045, "acc_norm,none": 0.318, "acc_norm_stderr,none": 0.02084757162081401}, "piqa": {"alias": "piqa", "acc,none": 0.6632208922742111, "acc_stderr,none": 0.011026738925251179, "acc_norm,none": 0.6708378672470077, "acc_norm_stderr,none": 0.010963750414134702}, "race": {"alias": "race", "acc,none": 0.35406698564593303, "acc_stderr,none": 0.014800834711677318}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.387410440122825, "acc_stderr,none": 0.011023495621289044}, "winogrande": {"alias": "winogrande", "acc,none": 0.5769534333070244, "acc_stderr,none": 0.01388505535905647}} {"created_at": "2025-04-22T13:24:41.157515", "global_step": 212000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.30204778156996587, "acc_stderr,none": 0.013417519144716419, "acc_norm,none": 0.3506825938566553, "acc_norm_stderr,none": 0.013944635930726097}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6296296296296297, "acc_stderr,none": 0.009908978578665757, "acc_norm,none": 0.5909090909090909, "acc_norm_stderr,none": 0.01008877515261578}, "boolq": {"alias": "boolq", "acc,none": 0.6406727828746177, "acc_stderr,none": 0.008391811770406737}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20475020475020475, "acc_stderr,none": 0.011552714477876673}, "copa": {"alias": "copa", "acc,none": 0.7, "acc_stderr,none": 0.046056618647183814}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3963353913563035, "acc_stderr,none": 0.004881359589148995, "acc_norm,none": 0.5135431189006174, "acc_norm_stderr,none": 0.004987950663406527}, "mmlu": {"acc,none": 0.26093149124056403, "acc_stderr,none": 0.0036968502927538266, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2516471838469713, "acc_stderr,none": 0.006319552839980605, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.373015873015873, "acc_stderr,none": 0.04325506042017086}, 
"mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.03401506715249039}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28921568627450983, "acc_stderr,none": 0.03182231867647553}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2616033755274262, "acc_stderr,none": 0.028609516716994934}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.18181818181818182, "acc_stderr,none": 0.035208939510976554}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.039578354719809784}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25766871165644173, "acc_stderr,none": 0.03436150827846917}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.22832369942196531, "acc_stderr,none": 0.02259870380432164}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24022346368715083, "acc_stderr,none": 0.014288343803925307}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2315112540192926, "acc_stderr,none": 0.023956532766639133}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.022779719088733396}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2692307692307692, "acc_stderr,none": 0.011328734403140316}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03218093795602357}, "mmlu_other": {"acc,none": 0.2442870936594786, "acc_stderr,none": 0.007703131327012305, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23773584905660378, "acc_stderr,none": 0.026199808807561918}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.0332055644308557}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.14, "acc_stderr,none": 0.034873508801977704}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.25112107623318386, "acc_stderr,none": 0.0291052208332246}, "mmlu_management": {"alias": " - management", "acc,none": 0.3300970873786408, "acc_stderr,none": 0.046561471100123514}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674057}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.21966794380587484, "acc_stderr,none": 0.014805384478371179}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24183006535947713, "acc_stderr,none": 0.024518195641879334}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2624113475177305, "acc_stderr,none": 0.02624492034984301}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.25, "acc_stderr,none": 0.026303648393696036}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25301204819277107, "acc_stderr,none": 0.033844291552331346}, "mmlu_social_sciences": {"acc,none": 0.273643158921027, "acc_stderr,none": 0.00800509193816933, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 
0.04096985139843671}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.23737373737373738, "acc_stderr,none": 0.03031371053819889}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2849740932642487, "acc_stderr,none": 0.032577140777096614}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3230769230769231, "acc_stderr,none": 0.02371088850197057}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.35294117647058826, "acc_stderr,none": 0.031041941304059267}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.30642201834862387, "acc_stderr,none": 0.01976551722045852}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.0364129708131373}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.016906615927288135}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.17272727272727273, "acc_stderr,none": 0.03620691833929218}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22448979591836735, "acc_stderr,none": 0.02671143055553842}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.31343283582089554, "acc_stderr,none": 0.03280188205348642}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_stem": {"acc,none": 0.2787821122740247, "acc_stderr,none": 0.007968325440911653, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.03785714465066654}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.29605263157894735, "acc_stderr,none": 0.03715062154998905}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.4, "acc_stderr,none": 0.04923659639173309}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3431372549019608, "acc_stderr,none": 0.04724007352383887}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816508}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.028809989854102977}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131183}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.02306818884826111}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2967741935483871, "acc_stderr,none": 0.025988500792411894}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.03194740072265541}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", 
"acc,none": 0.18, "acc_stderr,none": 0.038612291966536975}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.27037037037037037, "acc_stderr,none": 0.027080372815145668}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2980132450331126, "acc_stderr,none": 0.037345356767871984}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.32407407407407407, "acc_stderr,none": 0.03191923445686186}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.15178571428571427, "acc_stderr,none": 0.034057028381856924}, "mmlu_pro": {"exact_match,custom-extract": 0.1116190159574468, "exact_match_stderr,custom-extract": 0.0028647572720373257, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.14644351464435146, "exact_match_stderr,custom-extract": 0.0132127949068298}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11787072243346007, "exact_match_stderr,custom-extract": 0.01148698310019901}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07862190812720848, "exact_match_stderr,custom-extract": 0.008003121831736761}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.08536585365853659, "exact_match_stderr,custom-extract": 0.013816694190586966}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1552132701421801, "exact_match_stderr,custom-extract": 0.012471657591396923}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.1001031991744066, "exact_match_stderr,custom-extract": 0.009646786210097206}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12836185819070906, "exact_match_stderr,custom-extract": 0.011702403876565487}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11023622047244094, "exact_match_stderr,custom-extract": 0.016065998434778177}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09900090826521345, "exact_match_stderr,custom-extract": 0.009005035380672361}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0851221317542561, "exact_match_stderr,custom-extract": 0.007595142426181172}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.13095238095238096, "exact_match_stderr,custom-extract": 0.011103953542030809}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12625250501002003, "exact_match_stderr,custom-extract": 0.014883268009546959}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10238645111624327, "exact_match_stderr,custom-extract": 0.008414505495298187}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.13157894736842105, "exact_match_stderr,custom-extract": 0.01197372321590041}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.232, "acc_stderr,none": 0.01889619359195204, "acc_norm,none": 0.342, "acc_norm_stderr,none": 0.021236147199899257}, "piqa": {"alias": "piqa", "acc,none": 0.6664853101196954, "acc_stderr,none": 0.011000139592184573, "acc_norm,none": 0.6741022850924918, "acc_norm_stderr,none": 0.010935760218903951}, "race": {"alias": "race", "acc,none": 0.34258373205741627, "acc_stderr,none": 0.01468768473714516}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4022517911975435, "acc_stderr,none": 0.011095758951308073}, "winogrande": {"alias": 
"winogrande", "acc,none": 0.5887924230465666, "acc_stderr,none": 0.013829128358676859}} {"created_at": "2025-04-22T15:27:41.415847", "global_step": 214000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.31399317406143346, "acc_stderr,none": 0.013562691224726286, "acc_norm,none": 0.33361774744027306, "acc_norm_stderr,none": 0.013778687054176538}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6388888888888888, "acc_stderr,none": 0.00985601342581124, "acc_norm,none": 0.6035353535353535, "acc_norm_stderr,none": 0.010037412763064526}, "boolq": {"alias": "boolq", "acc,none": 0.5877675840978593, "acc_stderr,none": 0.008609271915720914}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20147420147420148, "acc_stderr,none": 0.011483500195202903}, "copa": {"alias": "copa", "acc,none": 0.7, "acc_stderr,none": 0.046056618647183814}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3970324636526588, "acc_stderr,none": 0.004882828727152302, "acc_norm,none": 0.5183230432184823, "acc_norm_stderr,none": 0.0049864298081467705}, "mmlu": {"acc,none": 0.25893747329440253, "acc_stderr,none": 0.003693054721183228, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25674814027630183, "acc_stderr,none": 0.0063700898981528375, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.040406101782088394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.03401506715249039}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.03166009679399811}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25738396624472576, "acc_stderr,none": 0.028458820991460285}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2231404958677686, "acc_stderr,none": 0.03800754475228733}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094633}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.03226219377286774}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.02207570925175718}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2737430167597765, "acc_stderr,none": 0.014912413096372434}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.24437299035369775, "acc_stderr,none": 0.02440616209466889}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25, "acc_stderr,none": 0.02409347123262133}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26597131681877445, "acc_stderr,none": 0.011285033165551284}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.21637426900584794, "acc_stderr,none": 0.03158149539338733}, "mmlu_other": {"acc,none": 0.24750563244287094, "acc_stderr,none": 0.00773550883793023, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.27547169811320754, "acc_stderr,none": 0.02749566368372406}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3063583815028902, "acc_stderr,none": 0.035149425512674394}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, 
"acc_stderr,none": 0.038612291966536955}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.21524663677130046, "acc_stderr,none": 0.02758406660220824}, "mmlu_management": {"alias": " - management", "acc,none": 0.22330097087378642, "acc_stderr,none": 0.04123553189891431}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674057}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2260536398467433, "acc_stderr,none": 0.01495745850433583}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.238562091503268, "acc_stderr,none": 0.02440439492808787}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2801418439716312, "acc_stderr,none": 0.026789172351140252}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.21323529411764705, "acc_stderr,none": 0.024880971512294268}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.26506024096385544, "acc_stderr,none": 0.03436024037944967}, "mmlu_social_sciences": {"acc,none": 0.2811179720506987, "acc_stderr,none": 0.008091411734560936, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.0414243971948936}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.32323232323232326, "acc_stderr,none": 0.03332299921070644}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.31088082901554404, "acc_stderr,none": 0.03340361906276586}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2794871794871795, "acc_stderr,none": 0.022752388839776826}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.33613445378151263, "acc_stderr,none": 0.030684737115135363}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.30825688073394497, "acc_stderr,none": 0.01979836669836726}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.0364129708131373}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2434640522875817, "acc_stderr,none": 0.017362473762146627}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072775}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2816326530612245, "acc_stderr,none": 0.02879518557429129}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.31343283582089554, "acc_stderr,none": 0.03280188205348643}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_stem": {"acc,none": 0.25182366000634315, "acc_stderr,none": 0.007726040674624342, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.03690677986137282}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.22916666666666666, "acc_stderr,none": 0.03514697467862388}, 
"mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165065}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.043898699568087785}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3021276595744681, "acc_stderr,none": 0.030017554471880557}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2206896551724138, "acc_stderr,none": 0.03455930201924812}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.021132859182754423}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.23548387096774193, "acc_stderr,none": 0.024137632429337707}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2512315270935961, "acc_stderr,none": 0.030516530732694436}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.21851851851851853, "acc_stderr,none": 0.025195752251823796}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2980132450331126, "acc_stderr,none": 0.037345356767871984}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3101851851851852, "acc_stderr,none": 0.03154696285656629}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755804}, "mmlu_pro": {"exact_match,custom-extract": 0.10114694148936171, "exact_match_stderr,custom-extract": 0.0027391037901222477, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.14086471408647142, "exact_match_stderr,custom-extract": 0.013000958624507557}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11787072243346007, "exact_match_stderr,custom-extract": 0.01148698310019901}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.05830388692579505, "exact_match_stderr,custom-extract": 0.006967433636014823}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.11219512195121951, "exact_match_stderr,custom-extract": 0.01560573029367581}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.11374407582938388, "exact_match_stderr,custom-extract": 0.010935286894179327}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08565531475748193, "exact_match_stderr,custom-extract": 0.008994860895662222}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.11124694376528117, "exact_match_stderr,custom-extract": 0.011000782283753624}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11023622047244094, "exact_match_stderr,custom-extract": 0.016065998434778166}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09900090826521345, 
"exact_match_stderr,custom-extract": 0.009005035380672361}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06513693560325684, "exact_match_stderr,custom-extract": 0.006716156044746713}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1266233766233766, "exact_match_stderr,custom-extract": 0.010946036109831504}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13026052104208416, "exact_match_stderr,custom-extract": 0.015082951205521085}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07929176289453425, "exact_match_stderr,custom-extract": 0.007499593169839569}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.14661654135338345, "exact_match_stderr,custom-extract": 0.012529520031289975}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.224, "acc_stderr,none": 0.018663994464710797, "acc_norm,none": 0.348, "acc_norm_stderr,none": 0.0213237286328075}, "piqa": {"alias": "piqa", "acc,none": 0.6697497279651795, "acc_stderr,none": 0.010972947133006292, "acc_norm,none": 0.6648531011969532, "acc_norm_stderr,none": 0.011013513128643933}, "race": {"alias": "race", "acc,none": 0.35119617224880384, "acc_stderr,none": 0.014773430019036972}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4104401228249744, "acc_stderr,none": 0.011131091241082379}, "winogrande": {"alias": "winogrande", "acc,none": 0.5966850828729282, "acc_stderr,none": 0.013787257285896236}} {"created_at": "2025-04-22T17:06:56.710051", "global_step": 216000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.30204778156996587, "acc_stderr,none": 0.013417519144716417, "acc_norm,none": 0.3515358361774744, "acc_norm_stderr,none": 0.013952413699600938}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6127946127946128, "acc_stderr,none": 0.009995312065890348, "acc_norm,none": 0.5959595959595959, "acc_norm_stderr,none": 0.010069061649549549}, "boolq": {"alias": "boolq", "acc,none": 0.653211009174312, "acc_stderr,none": 0.008324380793263163}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.1891891891891892, "acc_stderr,none": 0.011213159711868606}, "copa": {"alias": "copa", "acc,none": 0.73, "acc_stderr,none": 0.0446196043338474}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.39852619000199163, "acc_stderr,none": 0.004885942040894558, "acc_norm,none": 0.5212109141605258, "acc_norm_stderr,none": 0.004985289555586538}, "mmlu": {"acc,none": 0.2313060817547358, "acc_stderr,none": 0.0035528083156583234, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23995749202975558, "acc_stderr,none": 0.006225676467139033, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23015873015873015, "acc_stderr,none": 0.03764950879790604}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.22424242424242424, "acc_stderr,none": 0.03256866661681102}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591361}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2489451476793249, "acc_stderr,none": 0.028146970599422644}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.24793388429752067, "acc_stderr,none": 0.03941897526516301}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": 
{"alias": " - logical_fallacies", "acc,none": 0.2085889570552147, "acc_stderr,none": 0.03192193448934722}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2658959537572254, "acc_stderr,none": 0.02378620325550828}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23687150837988827, "acc_stderr,none": 0.014219570788103984}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19292604501607716, "acc_stderr,none": 0.022411516780911363}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21604938271604937, "acc_stderr,none": 0.02289916291844581}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2438070404172099, "acc_stderr,none": 0.010966507972178473}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.03565079670708311}, "mmlu_other": {"acc,none": 0.23913743160605086, "acc_stderr,none": 0.007640442105058633, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891366}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.03095289021774988}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.29596412556053814, "acc_stderr,none": 0.03063659134869983}, "mmlu_management": {"alias": " - management", "acc,none": 0.18446601941747573, "acc_stderr,none": 0.03840423627288276}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.029872577708891165}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23371647509578544, "acc_stderr,none": 0.015133383278988827}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.02380518652488814}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24113475177304963, "acc_stderr,none": 0.025518731049537766}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.18382352941176472, "acc_stderr,none": 0.023529242185193106}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2710843373493976, "acc_stderr,none": 0.03460579907553027}, "mmlu_social_sciences": {"acc,none": 0.2219694507637309, "acc_stderr,none": 0.007476022106421978, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.04142439719489362}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.026552207828215293}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20256410256410257, "acc_stderr,none": 0.02037766097037139}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2184873949579832, "acc_stderr,none": 0.02684151432295894}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 
0.1908256880733945, "acc_stderr,none": 0.016847676400091122}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2824427480916031, "acc_stderr,none": 0.03948406125768361}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25326797385620914, "acc_stderr,none": 0.017593486895366835}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.04122066502878284}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.18775510204081633, "acc_stderr,none": 0.025000256039546212}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.030360490154014652}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_stem": {"acc,none": 0.219790675547098, "acc_stderr,none": 0.0073570012484394775, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.03455473702325436}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.18421052631578946, "acc_stderr,none": 0.031546980450822305}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.038739587141493524}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3021276595744681, "acc_stderr,none": 0.030017554471880554}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.036951833116502325}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.1984126984126984, "acc_stderr,none": 0.020539481261886875}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.18387096774193548, "acc_stderr,none": 0.02203721734026784}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.15270935960591134, "acc_stderr,none": 0.025308904539380624}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.02504044387700069}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2251655629139073, "acc_stderr,none": 0.03410435282008936}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18981481481481483, "acc_stderr,none": 0.026744714834691943}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "mmlu_pro": {"exact_match,custom-extract": 0.09973404255319149, "exact_match_stderr,custom-extract": 
0.002724570857119084, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.12552301255230125, "exact_match_stderr,custom-extract": 0.012381673804057348}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09632446134347275, "exact_match_stderr,custom-extract": 0.010510211344233737}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07597173144876325, "exact_match_stderr,custom-extract": 0.007878387512999461}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.09268292682926829, "exact_match_stderr,custom-extract": 0.014338963443185472}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.12796208530805686, "exact_match_stderr,custom-extract": 0.011505210023672541}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08152734778121776, "exact_match_stderr,custom-extract": 0.008795227818605757}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.14180929095354522, "exact_match_stderr,custom-extract": 0.012204871709898273}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.08136482939632546, "exact_match_stderr,custom-extract": 0.014024845803977616}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07084468664850137, "exact_match_stderr,custom-extract": 0.007735732733830239}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07327905255366396, "exact_match_stderr,custom-extract": 0.007092470342788471}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11688311688311688, "exact_match_stderr,custom-extract": 0.010575091539720229}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13026052104208416, "exact_match_stderr,custom-extract": 0.015082951205521087}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.0962278675904542, "exact_match_stderr,custom-extract": 0.008185449955744845}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12656641604010024, "exact_match_stderr,custom-extract": 0.011777280638403529}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.236, "acc_stderr,none": 0.019008699622084735, "acc_norm,none": 0.346, "acc_norm_stderr,none": 0.02129495127723464}, "piqa": {"alias": "piqa", "acc,none": 0.6692056583242655, "acc_stderr,none": 0.010977520584714436, "acc_norm,none": 0.6730141458106638, "acc_norm_stderr,none": 0.010945157126978236}, "race": {"alias": "race", "acc,none": 0.35406698564593303, "acc_stderr,none": 0.014800834711677318}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4119754350051177, "acc_stderr,none": 0.011137360400975266}, "winogrande": {"alias": "winogrande", "acc,none": 0.590370955011839, "acc_stderr,none": 0.013821049109655478}} {"created_at": "2025-04-22T20:34:38.297496", "global_step": 220000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.31143344709897613, "acc_stderr,none": 0.013532472099850942, "acc_norm,none": 0.3532423208191126, "acc_norm_stderr,none": 0.013967822714840055}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6287878787878788, "acc_stderr,none": 0.009913599001845746, "acc_norm,none": 0.61489898989899, "acc_norm_stderr,none": 0.00998521479873725}, "boolq": {"alias": "boolq", "acc,none": 0.6256880733944954, "acc_stderr,none": 0.008464246656443238}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21375921375921375, 
"acc_stderr,none": 0.011737086112127206}, "copa": {"alias": "copa", "acc,none": 0.7, "acc_stderr,none": 0.046056618647183814}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.39762995419239194, "acc_stderr,none": 0.004884079750433896, "acc_norm,none": 0.5199163513244374, "acc_norm_stderr,none": 0.004985821336146407}, "mmlu": {"acc,none": 0.2641361629397522, "acc_stderr,none": 0.0037095369055797967, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25759829968119025, "acc_stderr,none": 0.006374906524150141, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.31746031746031744, "acc_stderr,none": 0.04163453031302859}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.034277431758165236}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.22058823529411764, "acc_stderr,none": 0.029102254389674082}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.29957805907172996, "acc_stderr,none": 0.029818024749753095}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.19834710743801653, "acc_stderr,none": 0.036401182719909456}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052192}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2331288343558282, "acc_stderr,none": 0.0332201579577674}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2745664739884393, "acc_stderr,none": 0.024027745155265016}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.27150837988826815, "acc_stderr,none": 0.014874252168095271}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.21543408360128619, "acc_stderr,none": 0.02335022547547143}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.23765432098765432, "acc_stderr,none": 0.023683591837008557}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.258148631029987, "acc_stderr,none": 0.011176923719313397}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2573099415204678, "acc_stderr,none": 0.03352799844161865}, "mmlu_other": {"acc,none": 0.24686192468619247, "acc_stderr,none": 0.007720741758055842, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421255}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2943396226415094, "acc_stderr,none": 0.02804918631569525}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173043}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.14, "acc_stderr,none": 0.03487350880197771}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.23766816143497757, "acc_stderr,none": 0.028568079464714284}, "mmlu_management": {"alias": " - management", "acc,none": 0.30097087378640774, "acc_stderr,none": 0.045416094465039476}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2264957264957265, "acc_stderr,none": 0.027421007295392912}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.21455938697318008, "acc_stderr,none": 0.014680033956893346}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 
0.2973856209150327, "acc_stderr,none": 0.02617390850671858}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24468085106382978, "acc_stderr,none": 0.02564555362226673}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.29044117647058826, "acc_stderr,none": 0.02757646862274052}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25903614457831325, "acc_stderr,none": 0.03410646614071856}, "mmlu_social_sciences": {"acc,none": 0.289892752681183, "acc_stderr,none": 0.008155524875439836, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748141}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.32323232323232326, "acc_stderr,none": 0.03332299921070643}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.34196891191709844, "acc_stderr,none": 0.034234651001042816}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3435897435897436, "acc_stderr,none": 0.024078696580635477}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.33613445378151263, "acc_stderr,none": 0.03068473711513536}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.29724770642201837, "acc_stderr,none": 0.019595707224643537}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.03641297081313729}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.23202614379084968, "acc_stderr,none": 0.01707737337785699}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.04172343038705383}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3306122448979592, "acc_stderr,none": 0.0301164262965406}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.263681592039801, "acc_stderr,none": 0.031157150869355558}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_stem": {"acc,none": 0.265778623533143, "acc_stderr,none": 0.00782350306045775, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.17037037037037037, "acc_stderr,none": 0.03247781185995593}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3223684210526316, "acc_stderr,none": 0.03803510248351585}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.03586879280080342}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04690650298201942}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.19574468085106383, 
"acc_stderr,none": 0.025937853139977148}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.20689655172413793, "acc_stderr,none": 0.03375672449560553}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.022182037202948368}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2806451612903226, "acc_stderr,none": 0.02556060472102289}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2660098522167488, "acc_stderr,none": 0.03108982600293752}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24814814814814815, "acc_stderr,none": 0.0263357394040558}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.33112582781456956, "acc_stderr,none": 0.038425817186598696}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.39814814814814814, "acc_stderr,none": 0.033384734032074016}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.22321428571428573, "acc_stderr,none": 0.039523019677025116}, "mmlu_pro": {"exact_match,custom-extract": 0.10064827127659574, "exact_match_stderr,custom-extract": 0.0027345488674096743, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.12133891213389121, "exact_match_stderr,custom-extract": 0.012202652228883252}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10646387832699619, "exact_match_stderr,custom-extract": 0.010987378600044841}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06802120141342756, "exact_match_stderr,custom-extract": 0.007486759168004525}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.09024390243902439, "exact_match_stderr,custom-extract": 0.014168039768581506}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1457345971563981, "exact_match_stderr,custom-extract": 0.01215245311302899}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08875128998968008, "exact_match_stderr,custom-extract": 0.00914046145748861}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1356968215158924, "exact_match_stderr,custom-extract": 0.011981380605227186}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.09973753280839895, "exact_match_stderr,custom-extract": 0.015371706524248107}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08356039963669391, "exact_match_stderr,custom-extract": 0.008343645336381295}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06661732050333087, "exact_match_stderr,custom-extract": 0.0067866673822465865}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12445887445887446, "exact_match_stderr,custom-extract": 0.01086551608988591}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12625250501002003, "exact_match_stderr,custom-extract": 0.014883268009546964}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08545034642032333, "exact_match_stderr,custom-extract": 0.007759311952095537}, "mmlu_pro_psychology": {"alias": " - psychology", 
"exact_match,custom-extract": 0.12155388471177944, "exact_match_stderr,custom-extract": 0.011574782101911051}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.228, "acc_stderr,none": 0.018781306529363197, "acc_norm,none": 0.336, "acc_norm_stderr,none": 0.021144791425048853}, "piqa": {"alias": "piqa", "acc,none": 0.6800870511425462, "acc_stderr,none": 0.010882873582092051, "acc_norm,none": 0.6719260065288357, "acc_norm_stderr,none": 0.010954487135124213}, "race": {"alias": "race", "acc,none": 0.3320574162679426, "acc_stderr,none": 0.014575582129545916}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4165813715455476, "acc_stderr,none": 0.011155497599417951}, "winogrande": {"alias": "winogrande", "acc,none": 0.5951065509076559, "acc_stderr,none": 0.013795927003124943}} {"created_at": "2025-04-22T22:32:13.180251", "global_step": 222000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3412969283276451, "acc_stderr,none": 0.013855831287497724, "acc_norm,none": 0.3728668941979522, "acc_norm_stderr,none": 0.014131176760131167}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6477272727272727, "acc_stderr,none": 0.009801753933112774, "acc_norm,none": 0.6384680134680135, "acc_norm_stderr,none": 0.009858506543162053}, "boolq": {"alias": "boolq", "acc,none": 0.6305810397553517, "acc_stderr,none": 0.00844155753179963}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19492219492219492, "acc_stderr,none": 0.011341478090883528}, "copa": {"alias": "copa", "acc,none": 0.66, "acc_stderr,none": 0.04760952285695237}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.39842660824536946, "acc_stderr,none": 0.004885735963346908, "acc_norm,none": 0.5196176060545709, "acc_norm_stderr,none": 0.004985939292819573}, "mmlu": {"acc,none": 0.23785785500640935, "acc_stderr,none": 0.0035876357414686244, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24420828905419767, "acc_stderr,none": 0.006262745128486126, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.29365079365079366, "acc_stderr,none": 0.04073524322147126}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.22424242424242424, "acc_stderr,none": 0.03256866661681102}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27941176470588236, "acc_stderr,none": 0.031493281045079556}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.27848101265822783, "acc_stderr,none": 0.02917868230484256}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.24793388429752067, "acc_stderr,none": 0.03941897526516302}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.04414343666854932}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.03226219377286774}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0230836585869842}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.21221864951768488, "acc_stderr,none": 0.023222756797435115}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.20679012345679013, "acc_stderr,none": 0.022535006705942818}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24445893089960888, "acc_stderr,none": 
0.010976425013113897}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.29239766081871343, "acc_stderr,none": 0.03488647713457923}, "mmlu_other": {"acc,none": 0.25523012552301255, "acc_stderr,none": 0.00780312244013238, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.04605661864718381}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22264150943396227, "acc_stderr,none": 0.025604233470899098}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173044}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.33183856502242154, "acc_stderr,none": 0.0316029514377668}, "mmlu_management": {"alias": " - management", "acc,none": 0.18446601941747573, "acc_stderr,none": 0.03840423627288276}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.029872577708891165}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621505}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.26309067688378035, "acc_stderr,none": 0.015745497169049064}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23202614379084968, "acc_stderr,none": 0.024170840879341033}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2695035460992908, "acc_stderr,none": 0.02646903681859063}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1801470588235294, "acc_stderr,none": 0.023345163616544838}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.27710843373493976, "acc_stderr,none": 0.034843315926805875}, "mmlu_social_sciences": {"acc,none": 0.22879428014299644, "acc_stderr,none": 0.00757471374499867, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281335}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.02962022787479048}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20725388601036268, "acc_stderr,none": 0.029252823291803624}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2205128205128205, "acc_stderr,none": 0.02102067268082791}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2184873949579832, "acc_stderr,none": 0.02684151432295894}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.20550458715596331, "acc_stderr,none": 0.017324352325016022}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.036412970813137296}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25326797385620914, "acc_stderr,none": 0.017593486895366835}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.04350271442923243}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20408163265306123, "acc_stderr,none": 0.02580128347509051}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2736318407960199, "acc_stderr,none": 0.031524391865554016}, 
"mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_stem": {"acc,none": 0.22010783380907073, "acc_stderr,none": 0.007360137286162457, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19736842105263158, "acc_stderr,none": 0.03238981601699397}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036624}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847415}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165065}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.03708284662416544}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.25957446808510637, "acc_stderr,none": 0.028659179374292323}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.021935878081184756}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.18387096774193548, "acc_stderr,none": 0.022037217340267833}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1477832512315271, "acc_stderr,none": 0.024969621333521264}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23333333333333334, "acc_stderr,none": 0.025787874220959347}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.18543046357615894, "acc_stderr,none": 0.031732843842942865}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.14814814814814814, "acc_stderr,none": 0.024227629273728363}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.10114694148936171, "exact_match_stderr,custom-extract": 0.002739583385303224, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.13528591352859135, "exact_match_stderr,custom-extract": 0.012782212846937781}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10899873257287707, "exact_match_stderr,custom-extract": 0.011101630697795408}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.04328621908127209, "exact_match_stderr,custom-extract": 0.0060511029106954765}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.11707317073170732, "exact_match_stderr,custom-extract": 0.015897520483461717}, "mmlu_pro_economics": {"alias": 
" - economics", "exact_match,custom-extract": 0.13033175355450238, "exact_match_stderr,custom-extract": 0.01159546417895376}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08565531475748193, "exact_match_stderr,custom-extract": 0.008994860895662196}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1198044009779951, "exact_match_stderr,custom-extract": 0.0113609579950746}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11811023622047244, "exact_match_stderr,custom-extract": 0.01655614119804243}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08537693006357856, "exact_match_stderr,custom-extract": 0.008425486761039498}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07549962990377498, "exact_match_stderr,custom-extract": 0.007190499688409158}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10714285714285714, "exact_match_stderr,custom-extract": 0.010180561923017176}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11823647294589178, "exact_match_stderr,custom-extract": 0.014468953704661764}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10469591993841416, "exact_match_stderr,custom-extract": 0.008497923442833757}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.13909774436090225, "exact_match_stderr,custom-extract": 0.012257666634016472}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.248, "acc_stderr,none": 0.019332342821239103, "acc_norm,none": 0.36, "acc_norm_stderr,none": 0.02148775108972052}, "piqa": {"alias": "piqa", "acc,none": 0.6751904243743199, "acc_stderr,none": 0.010926296238294036, "acc_norm,none": 0.6746463547334058, "acc_norm_stderr,none": 0.010931036623525193}, "race": {"alias": "race", "acc,none": 0.3617224880382775, "acc_stderr,none": 0.014871072026717745}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.41914022517911975, "acc_stderr,none": 0.01116514070817033}, "winogrande": {"alias": "winogrande", "acc,none": 0.5982636148382005, "acc_stderr,none": 0.013778439266649508}} {"created_at": "2025-04-23T00:42:52.333921", "global_step": 224000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.302901023890785, "acc_stderr,none": 0.013428241573185347, "acc_norm,none": 0.34982935153583616, "acc_norm_stderr,none": 0.01393680921215829}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6477272727272727, "acc_stderr,none": 0.009801753933112774, "acc_norm,none": 0.627104377104377, "acc_norm_stderr,none": 0.009922743197129243}, "boolq": {"alias": "boolq", "acc,none": 0.5574923547400612, "acc_stderr,none": 0.008687051315181372}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2203112203112203, "acc_stderr,none": 0.011865854943402445}, "copa": {"alias": "copa", "acc,none": 0.76, "acc_stderr,none": 0.04292346959909283}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.40151364270065726, "acc_stderr,none": 0.004892026457294718, "acc_norm,none": 0.523401712806214, "acc_norm_stderr,none": 0.004984313205791442}, "mmlu": {"acc,none": 0.2452642073778664, "acc_stderr,none": 0.003622640185829371, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2452709883103082, "acc_stderr,none": 0.006272997285036627, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23015873015873015, "acc_stderr,none": 0.03764950879790607}, "mmlu_high_school_european_history": {"alias": " - 
high_school_european_history", "acc,none": 0.22424242424242424, "acc_stderr,none": 0.03256866661681102}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.03077855467869326}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.03984979653302872}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3148148148148148, "acc_stderr,none": 0.04489931073591311}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615767}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24566473988439305, "acc_stderr,none": 0.02317629820399201}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1832797427652733, "acc_stderr,none": 0.021974198848265816}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25, "acc_stderr,none": 0.02409347123262133}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2542372881355932, "acc_stderr,none": 0.011121129007840682}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.25146198830409355, "acc_stderr,none": 0.033275044238468436}, "mmlu_other": {"acc,none": 0.27035725780495656, "acc_stderr,none": 0.007939846876899264, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24150943396226415, "acc_stderr,none": 0.02634148037111835}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.03242414757483098}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3542600896860987, "acc_stderr,none": 0.03210062154134987}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.0376017800602662}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.31196581196581197, "acc_stderr,none": 0.03035152732334494}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2669220945083014, "acc_stderr,none": 0.015818450894777552}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.024954184324879912}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2624113475177305, "acc_stderr,none": 0.026244920349843007}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19117647058823528, "acc_stderr,none": 0.023886881922440328}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.29518072289156627, "acc_stderr,none": 0.035509201856896294}, "mmlu_social_sciences": {"acc,none": 0.23431914202144946, "acc_stderr,none": 0.0076300430224738895, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.039994238792813365}, "mmlu_high_school_geography": 
{"alias": " - high_school_geography", "acc,none": 0.21717171717171718, "acc_stderr,none": 0.029376616484945633}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20725388601036268, "acc_stderr,none": 0.029252823291803627}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.21025641025641026, "acc_stderr,none": 0.02066059748502692}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2689075630252101, "acc_stderr,none": 0.028801392193631273}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23302752293577983, "acc_stderr,none": 0.018125669180861507}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.20610687022900764, "acc_stderr,none": 0.035477710041594626}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2679738562091503, "acc_stderr,none": 0.017917974069594726}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.17959183673469387, "acc_stderr,none": 0.024573293589585637}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23383084577114427, "acc_stderr,none": 0.029929415408348377}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_stem": {"acc,none": 0.23120837297811608, "acc_stderr,none": 0.00748354544149584, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.1925925925925926, "acc_stderr,none": 0.03406542058502653}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.17763157894736842, "acc_stderr,none": 0.031103182383123387}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566018}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.16, "acc_stderr,none": 0.03684529491774711}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.30638297872340425, "acc_stderr,none": 0.03013590647851756}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.03600105692727773}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25132275132275134, "acc_stderr,none": 0.022340482339643898}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2064516129032258, "acc_stderr,none": 0.023025899617188733}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1724137931034483, "acc_stderr,none": 0.026577672183036572}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 
0.045604802157206845}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.02438843043398766}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.0347918557259966}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18055555555555555, "acc_stderr,none": 0.02623287897149166}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.35714285714285715, "acc_stderr,none": 0.04547960999764376}, "mmlu_pro": {"exact_match,custom-extract": 0.09965093085106383, "exact_match_stderr,custom-extract": 0.0027178050645804137, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.11576011157601115, "exact_match_stderr,custom-extract": 0.011956608475421735}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.08491761723700887, "exact_match_stderr,custom-extract": 0.009930380468209557}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.054770318021201414, "exact_match_stderr,custom-extract": 0.006765657432918792}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1048780487804878, "exact_match_stderr,custom-extract": 0.015150318019731046}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.13981042654028436, "exact_match_stderr,custom-extract": 0.011944090354236483}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08565531475748193, "exact_match_stderr,custom-extract": 0.008994860895662196}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12224938875305623, "exact_match_stderr,custom-extract": 0.011460350236485617}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12860892388451445, "exact_match_stderr,custom-extract": 0.01717316362524469}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.06993642143505904, "exact_match_stderr,custom-extract": 0.007689740413630032}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06291635825314582, "exact_match_stderr,custom-extract": 0.006608518078813461}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.13744588744588745, "exact_match_stderr,custom-extract": 0.01133334735001625}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.10420841683366733, "exact_match_stderr,custom-extract": 0.013691159072055356}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10007698229407236, "exact_match_stderr,custom-extract": 0.008329758962147552}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.15413533834586465, "exact_match_stderr,custom-extract": 0.012790054353381215}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.232, "acc_stderr,none": 0.018896193591952038, "acc_norm,none": 0.356, "acc_norm_stderr,none": 0.02143471235607264}, "piqa": {"alias": "piqa", "acc,none": 0.6653971708378672, "acc_stderr,none": 0.011009071725162497, "acc_norm,none": 0.6713819368879217, "acc_norm_stderr,none": 0.010959127105167044}, "race": {"alias": "race", "acc,none": 0.3464114832535885, "acc_stderr,none": 0.014726451021782803}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.42528147389969295, "acc_stderr,none": 0.011187027585757611}, "winogrande": {"alias": "winogrande", "acc,none": 0.6022099447513812, 
"acc_stderr,none": 0.013755743513749025}} {"created_at": "2025-04-23T02:34:54.258036", "global_step": 226000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3387372013651877, "acc_stderr,none": 0.01383056892797433, "acc_norm,none": 0.3703071672354949, "acc_norm_stderr,none": 0.01411129875167495}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6494107744107744, "acc_stderr,none": 0.009791003829831564, "acc_norm,none": 0.6367845117845118, "acc_norm_stderr,none": 0.00986839713611881}, "boolq": {"alias": "boolq", "acc,none": 0.6495412844036698, "acc_stderr,none": 0.008344769634724851}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2031122031122031, "acc_stderr,none": 0.011518254793634093}, "copa": {"alias": "copa", "acc,none": 0.73, "acc_stderr,none": 0.0446196043338474}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3995220075682135, "acc_stderr,none": 0.004887991225950287, "acc_norm,none": 0.5240987851025692, "acc_norm_stderr,none": 0.004983982396187367}, "mmlu": {"acc,none": 0.2622133599202393, "acc_stderr,none": 0.0037105318187342427, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25674814027630183, "acc_stderr,none": 0.006371656106466685, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3253968253968254, "acc_stderr,none": 0.04190596438871136}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.03346409881055953}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.031145570659486782}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.29957805907172996, "acc_stderr,none": 0.029818024749753095}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.24793388429752067, "acc_stderr,none": 0.03941897526516301}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2822085889570552, "acc_stderr,none": 0.03536117886664743}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.023083658586984204}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2536312849162011, "acc_stderr,none": 0.014551553659369922}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2604501607717042, "acc_stderr,none": 0.024926723224845546}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25, "acc_stderr,none": 0.02409347123262133}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2529335071707953, "acc_stderr,none": 0.01110226871383999}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.21637426900584794, "acc_stderr,none": 0.031581495393387345}, "mmlu_other": {"acc,none": 0.2648857418731896, "acc_stderr,none": 0.007922009267766707, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.26037735849056604, "acc_stderr,none": 0.02700876609070809}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.27167630057803466, "acc_stderr,none": 0.03391750322321659}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_human_aging": {"alias": " - 
human_aging", "acc,none": 0.23766816143497757, "acc_stderr,none": 0.028568079464714274}, "mmlu_management": {"alias": " - management", "acc,none": 0.2912621359223301, "acc_stderr,none": 0.04498676320572924}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.29914529914529914, "acc_stderr,none": 0.02999695185834948}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110196}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.25287356321839083, "acc_stderr,none": 0.015543377313719681}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2679738562091503, "acc_stderr,none": 0.025360603796242557}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25886524822695034, "acc_stderr,none": 0.026129572527180848}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.24632352941176472, "acc_stderr,none": 0.02617343857052}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25903614457831325, "acc_stderr,none": 0.034106466140718564}, "mmlu_social_sciences": {"acc,none": 0.28599285017874554, "acc_stderr,none": 0.008140444457891078, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281336}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3181818181818182, "acc_stderr,none": 0.03318477333845331}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.3160621761658031, "acc_stderr,none": 0.033553973696861736}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2743589743589744, "acc_stderr,none": 0.02262276576749322}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.030388353551886838}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3119266055045872, "acc_stderr,none": 0.019862967976707245}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.26717557251908397, "acc_stderr,none": 0.03880848301082393}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.017282760695167418}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3181818181818182, "acc_stderr,none": 0.04461272175910507}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.02892058322067561}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.263681592039801, "acc_stderr,none": 0.031157150869355568}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_stem": {"acc,none": 0.2445290199809705, "acc_stderr,none": 0.007656250394069891, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2236842105263158, "acc_stderr,none": 0.03391160934343602}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.28, 
"acc_stderr,none": 0.04512608598542127}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.19607843137254902, "acc_stderr,none": 0.03950581861179963}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.028504856470514196}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3310344827586207, "acc_stderr,none": 0.03921545312467121}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.022182037202948365}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24516129032258063, "acc_stderr,none": 0.02447224384089551}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1724137931034483, "acc_stderr,none": 0.02657767218303658}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.21851851851851853, "acc_stderr,none": 0.025195752251823796}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.03603038545360385}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.25, "acc_stderr,none": 0.029531221160930918}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2767857142857143, "acc_stderr,none": 0.042466243366976256}, "mmlu_pro": {"exact_match,custom-extract": 0.09507978723404255, "exact_match_stderr,custom-extract": 0.0026625148874761394, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.13110181311018132, "exact_match_stderr,custom-extract": 0.012613403336459906}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11153358681875793, "exact_match_stderr,custom-extract": 0.011213991771867712}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.04946996466431095, "exact_match_stderr,custom-extract": 0.006447961264949632}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.08292682926829269, "exact_match_stderr,custom-extract": 0.013636027558244166}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.11848341232227488, "exact_match_stderr,custom-extract": 0.011130898789446782}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09700722394220847, "exact_match_stderr,custom-extract": 0.009512759072461442}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1295843520782396, "exact_match_stderr,custom-extract": 0.011749749223850723}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10761154855643044, "exact_match_stderr,custom-extract": 0.0158969794527234}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.0653950953678474, "exact_match_stderr,custom-extract": 0.007454015200467411}, "mmlu_pro_math": {"alias": " - math", 
"exact_match,custom-extract": 0.06809770540340489, "exact_match_stderr,custom-extract": 0.00685621685567177}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1038961038961039, "exact_match_stderr,custom-extract": 0.010043335327351773}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11623246492985972, "exact_match_stderr,custom-extract": 0.014362104240159202}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.0731331793687452, "exact_match_stderr,custom-extract": 0.0072265090151180334}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.14786967418546365, "exact_match_stderr,custom-extract": 0.01257370908494229}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.23, "acc_stderr,none": 0.018839050391123144, "acc_norm,none": 0.364, "acc_norm_stderr,none": 0.021539170637317695}, "piqa": {"alias": "piqa", "acc,none": 0.6773667029379761, "acc_stderr,none": 0.010907166359856602, "acc_norm,none": 0.6751904243743199, "acc_norm_stderr,none": 0.010926296238294036}, "race": {"alias": "race", "acc,none": 0.3406698564593301, "acc_stderr,none": 0.014667904380876562}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4022517911975435, "acc_stderr,none": 0.011095758951308078}, "winogrande": {"alias": "winogrande", "acc,none": 0.611681136543015, "acc_stderr,none": 0.013697456658457228}} {"created_at": "2025-04-23T04:17:42.168847", "global_step": 228000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3174061433447099, "acc_stderr,none": 0.01360223908803817, "acc_norm,none": 0.36689419795221845, "acc_norm_stderr,none": 0.014084133118104298}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6464646464646465, "acc_stderr,none": 0.009809728948151497, "acc_norm,none": 0.6220538720538721, "acc_norm_stderr,none": 0.009949405744045469}, "boolq": {"alias": "boolq", "acc,none": 0.48073394495412847, "acc_stderr,none": 0.008738560570551961}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19901719901719903, "acc_stderr,none": 0.011430809442838382}, "copa": {"alias": "copa", "acc,none": 0.69, "acc_stderr,none": 0.04648231987117316}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3981278629755029, "acc_stderr,none": 0.0048851164655502755, "acc_norm,none": 0.5201155148376817, "acc_norm_stderr,none": 0.004985741706385719}, "mmlu": {"acc,none": 0.25174476570289134, "acc_stderr,none": 0.0036558106480554706, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2678002125398512, "acc_stderr,none": 0.00645414349748588, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.03893259610604675}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.033175059300091805}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.030587591351604236}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.28270042194092826, "acc_stderr,none": 0.029312814153955924}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2892561983471074, "acc_stderr,none": 0.04139112727635463}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3148148148148148, "acc_stderr,none": 0.04489931073591311}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2331288343558282, "acc_stderr,none": 
0.033220157957767414}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.25722543352601157, "acc_stderr,none": 0.02353292543104428}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2558659217877095, "acc_stderr,none": 0.014593620923210758}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.22186495176848875, "acc_stderr,none": 0.02359885829286305}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24691358024691357, "acc_stderr,none": 0.02399350170904211}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2861799217731421, "acc_stderr,none": 0.011543642878150757}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.036155076303109344}, "mmlu_other": {"acc,none": 0.24943675571290633, "acc_stderr,none": 0.0077396892594442015, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22264150943396227, "acc_stderr,none": 0.025604233470899098}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173044}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.33183856502242154, "acc_stderr,none": 0.03160295143776679}, "mmlu_management": {"alias": " - management", "acc,none": 0.18446601941747573, "acc_stderr,none": 0.03840423627288276}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.32051282051282054, "acc_stderr,none": 0.03057281131029961}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24393358876117496, "acc_stderr,none": 0.015357212665829472}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.21895424836601307, "acc_stderr,none": 0.02367908986180772}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.26595744680851063, "acc_stderr,none": 0.026358065698880592}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1801470588235294, "acc_stderr,none": 0.02334516361654484}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.2414689632759181, "acc_stderr,none": 0.007708762339744231, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.041424397194893624}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.1919191919191919, "acc_stderr,none": 0.02805779167298901}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2538860103626943, "acc_stderr,none": 0.03141024780565318}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.22564102564102564, "acc_stderr,none": 0.021193632525148536}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.18907563025210083, "acc_stderr,none": 0.025435119438105353}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22385321100917432, "acc_stderr,none": 0.017871217767790222}, "mmlu_human_sexuality": 
{"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.037276735755969195}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.018249024411207668}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.04069306319721376}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22857142857142856, "acc_stderr,none": 0.026882144922307744}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.26865671641791045, "acc_stderr,none": 0.03134328358208954}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_stem": {"acc,none": 0.24008880431335236, "acc_stderr,none": 0.00758627592650215, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.034554737023254366}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03459777606810537}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566018}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.16, "acc_stderr,none": 0.0368452949177471}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.04488482852329017}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2723404255319149, "acc_stderr,none": 0.029101290698386705}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.021935878081184756}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2032258064516129, "acc_stderr,none": 0.022891687984554966}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.17733990147783252, "acc_stderr,none": 0.02687433727680835}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.025348097468097873}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31125827814569534, "acc_stderr,none": 0.03780445850526732}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18055555555555555, "acc_stderr,none": 0.02623287897149166}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.044328040552915185}, "mmlu_pro": {"exact_match,custom-extract": 0.0990691489361702, "exact_match_stderr,custom-extract": 0.002709286479595426, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", 
"exact_match,custom-extract": 0.11854951185495119, "exact_match_stderr,custom-extract": 0.012080706552248592}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09632446134347275, "exact_match_stderr,custom-extract": 0.010510211344233727}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.05742049469964664, "exact_match_stderr,custom-extract": 0.006917690995369925}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.09024390243902439, "exact_match_stderr,custom-extract": 0.014168039768581506}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.14454976303317535, "exact_match_stderr,custom-extract": 0.012111342342350227}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10319917440660474, "exact_match_stderr,custom-extract": 0.00977796396738706}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12836185819070906, "exact_match_stderr,custom-extract": 0.011702403876565499}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11811023622047244, "exact_match_stderr,custom-extract": 0.01655614119804242}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.051771117166212535, "exact_match_stderr,custom-extract": 0.006680423815731711}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07253886010362694, "exact_match_stderr,custom-extract": 0.007059376421862613}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10822510822510822, "exact_match_stderr,custom-extract": 0.010225646711914654}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1342685370741483, "exact_match_stderr,custom-extract": 0.015277913884522439}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08160123171670516, "exact_match_stderr,custom-extract": 0.007598478818397718}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.16165413533834586, "exact_match_stderr,custom-extract": 0.013039948073306443}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.232, "acc_stderr,none": 0.01889619359195204, "acc_norm,none": 0.364, "acc_norm_stderr,none": 0.021539170637317695}, "piqa": {"alias": "piqa", "acc,none": 0.6806311207834603, "acc_stderr,none": 0.010877964076613749, "acc_norm,none": 0.6811751904243744, "acc_norm_stderr,none": 0.010873037534333418}, "race": {"alias": "race", "acc,none": 0.34832535885167465, "acc_stderr,none": 0.014745439038731609}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4196519959058342, "acc_stderr,none": 0.01116703230339053}, "winogrande": {"alias": "winogrande", "acc,none": 0.601420678768745, "acc_stderr,none": 0.01376035717687384}} {"created_at": "2025-04-23T06:08:53.425463", "global_step": 230000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.33447098976109213, "acc_stderr,none": 0.013787460322441384, "acc_norm,none": 0.3660409556313993, "acc_norm_stderr,none": 0.014077223108470137}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.640993265993266, "acc_stderr,none": 0.009843424713072176, "acc_norm,none": 0.6153198653198653, "acc_norm_stderr,none": 0.009983171707009015}, "boolq": {"alias": "boolq", "acc,none": 0.5938837920489297, "acc_stderr,none": 0.00858951094378741}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20556920556920558, "acc_stderr,none": 0.011569834551534302}, "copa": {"alias": "copa", "acc,none": 0.73, 
"acc_stderr,none": 0.04461960433384741}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4008165704043019, "acc_stderr,none": 0.004890623693243626, "acc_norm,none": 0.5262895837482573, "acc_norm_stderr,none": 0.00498287934069141}, "mmlu": {"acc,none": 0.27018943170488535, "acc_stderr,none": 0.003748164387976969, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.26992561105207225, "acc_stderr,none": 0.006472319513132136, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.038095238095238106}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.28484848484848485, "acc_stderr,none": 0.03524390844511783}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3088235294117647, "acc_stderr,none": 0.03242661719827218}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2869198312236287, "acc_stderr,none": 0.029443773022594693}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.043913262867240704}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25153374233128833, "acc_stderr,none": 0.034089978868575295}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24566473988439305, "acc_stderr,none": 0.02317629820399201}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24804469273743016, "acc_stderr,none": 0.01444415780826146}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.26366559485530544, "acc_stderr,none": 0.02502553850053234}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2839506172839506, "acc_stderr,none": 0.025089478523765127}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27509778357235987, "acc_stderr,none": 0.011405443620996929}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.03377310252209194}, "mmlu_other": {"acc,none": 0.27550691985838427, "acc_stderr,none": 0.00800934357599636, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2943396226415094, "acc_stderr,none": 0.02804918631569525}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2947976878612717, "acc_stderr,none": 0.03476599607516478}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.29596412556053814, "acc_stderr,none": 0.030636591348699803}, "mmlu_management": {"alias": " - management", "acc,none": 0.22330097087378642, "acc_stderr,none": 0.04123553189891431}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.029872577708891155}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.3001277139208174, "acc_stderr,none": 0.01638924969131743}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.026090162504279042}, "mmlu_professional_accounting": {"alias": " - 
professional_accounting", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.026011992930902013}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2426470588235294, "acc_stderr,none": 0.02604066247420127}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.18674698795180722, "acc_stderr,none": 0.030338749144500573}, "mmlu_social_sciences": {"acc,none": 0.2723431914202145, "acc_stderr,none": 0.008032678511049758, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.04142439719489359}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2828282828282828, "acc_stderr,none": 0.03208779558786751}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.29015544041450775, "acc_stderr,none": 0.03275264467791514}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.022421273612923707}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.02865749128507197}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.27339449541284405, "acc_stderr,none": 0.019109299846098292}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2900763358778626, "acc_stderr,none": 0.03980066246467765}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2875816993464052, "acc_stderr,none": 0.018311653053648222}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04265792110940589}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2163265306122449, "acc_stderr,none": 0.026358916334904028}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.25870646766169153, "acc_stderr,none": 0.03096590312357302}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_stem": {"acc,none": 0.26324135743736127, "acc_stderr,none": 0.007849702299311482, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04072314811876837}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.28289473684210525, "acc_stderr,none": 0.03665349695640767}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.03745554791462457}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342347}, "mmlu_electrical_engineering": {"alias": " - 
electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776568}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.267741935483871, "acc_stderr,none": 0.02518900666021238}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.22660098522167488, "acc_stderr,none": 0.02945486383529299}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.026202766534652148}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31788079470198677, "acc_stderr,none": 0.038020397601079024}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02988691054762697}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2767857142857143, "acc_stderr,none": 0.04246624336697624}, "mmlu_pro": {"exact_match,custom-extract": 0.10787898936170212, "exact_match_stderr,custom-extract": 0.002815600715467029, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.14504881450488144, "exact_match_stderr,custom-extract": 0.013160465168737619}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10392902408111533, "exact_match_stderr,custom-extract": 0.01087117585687005}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.05918727915194346, "exact_match_stderr,custom-extract": 0.007016725322511613}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.11219512195121951, "exact_match_stderr,custom-extract": 0.015605730293675818}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1528436018957346, "exact_match_stderr,custom-extract": 0.012393433537406424}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07533539731682147, "exact_match_stderr,custom-extract": 0.008483088136190933}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12591687041564792, "exact_match_stderr,custom-extract": 0.011606661034408546}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.13385826771653545, "exact_match_stderr,custom-extract": 0.017467280079326616}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09627611262488647, "exact_match_stderr,custom-extract": 0.00889366591573237}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07772020725388601, "exact_match_stderr,custom-extract": 0.007286709191616158}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12987012987012986, "exact_match_stderr,custom-extract": 0.011064857512116033}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12424849699398798, "exact_match_stderr,custom-extract": 0.014781596611020457}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09237875288683603, "exact_match_stderr,custom-extract": 0.008037130651407133}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.16290726817042606, "exact_match_stderr,custom-extract": 
0.013080605724028938}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.246, "acc_stderr,none": 0.01927981905635256, "acc_norm,none": 0.342, "acc_norm_stderr,none": 0.021236147199899254}, "piqa": {"alias": "piqa", "acc,none": 0.6675734494015234, "acc_stderr,none": 0.01099114155744559, "acc_norm,none": 0.6746463547334058, "acc_norm_stderr,none": 0.010931036623525191}, "race": {"alias": "race", "acc,none": 0.3339712918660287, "acc_stderr,none": 0.014596569299709724}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.413510747185261, "acc_stderr,none": 0.011143517724432124}, "winogrande": {"alias": "winogrande", "acc,none": 0.590370955011839, "acc_stderr,none": 0.013821049109655481}} {"created_at": "2025-04-23T07:46:30.287476", "global_step": 232000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3191126279863481, "acc_stderr,none": 0.013621696119173299, "acc_norm,none": 0.3660409556313993, "acc_norm_stderr,none": 0.014077223108470139}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6574074074074074, "acc_stderr,none": 0.009738105469984186, "acc_norm,none": 0.6367845117845118, "acc_norm_stderr,none": 0.009868397136118796}, "boolq": {"alias": "boolq", "acc,none": 0.6379204892966361, "acc_stderr,none": 0.00840577556824439}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19328419328419327, "acc_stderr,none": 0.011305207486827697}, "copa": {"alias": "copa", "acc,none": 0.73, "acc_stderr,none": 0.04461960433384741}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4034056960764788, "acc_stderr,none": 0.004895782107786497, "acc_norm,none": 0.5223063134833699, "acc_norm_stderr,none": 0.004984813391016208}, "mmlu": {"acc,none": 0.25765560461472725, "acc_stderr,none": 0.003686466155097131, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27523910733262485, "acc_stderr,none": 0.006509867953684484, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.037184890068181146}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.28484848484848485, "acc_stderr,none": 0.03524390844511782}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.031660096793998116}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.28270042194092826, "acc_stderr,none": 0.029312814153955917}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.371900826446281, "acc_stderr,none": 0.04412015806624504}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.0395783547198098}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.294478527607362, "acc_stderr,none": 0.03581165790474082}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.29190751445086704, "acc_stderr,none": 0.024476994076247333}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.29260450160771706, "acc_stderr,none": 0.02583989833487798}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2716049382716049, "acc_stderr,none": 0.02474862449053737}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27835723598435463, "acc_stderr,none": 0.011446990197380985}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 
0.2982456140350877, "acc_stderr,none": 0.03508771929824563}, "mmlu_other": {"acc,none": 0.24493080141615706, "acc_stderr,none": 0.007701345678299176, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23018867924528302, "acc_stderr,none": 0.025907897122408173}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.27167630057803466, "acc_stderr,none": 0.0339175032232166}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.20179372197309417, "acc_stderr,none": 0.02693611191280227}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.03916667762822585}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2606837606837607, "acc_stderr,none": 0.028760348956523414}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.26947637292464877, "acc_stderr,none": 0.015866243073215047}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.024954184324879905}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2801418439716312, "acc_stderr,none": 0.026789172351140245}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.16911764705882354, "acc_stderr,none": 0.022770868010113018}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.20481927710843373, "acc_stderr,none": 0.03141784291663925}, "mmlu_social_sciences": {"acc,none": 0.24081897952551187, "acc_stderr,none": 0.0077098151491087575, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.040969851398436716}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2474747474747475, "acc_stderr,none": 0.03074630074212451}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22279792746113988, "acc_stderr,none": 0.03003114797764154}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2128205128205128, "acc_stderr,none": 0.020752423722128013}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.20168067226890757, "acc_stderr,none": 0.026064313406304527}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23302752293577983, "acc_stderr,none": 0.0181256691808615}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.03768335959728744}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.018054027458815205}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.19090909090909092, "acc_stderr,none": 0.03764425585984924}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2653061224489796, "acc_stderr,none": 0.0282638899437846}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916718}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, 
"acc_stderr,none": 0.04408440022768078}, "mmlu_stem": {"acc,none": 0.2603869330796067, "acc_stderr,none": 0.007808408851700926, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.32592592592592595, "acc_stderr,none": 0.040491220417025055}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.29605263157894735, "acc_stderr,none": 0.03715062154998904}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.20425531914893616, "acc_stderr,none": 0.02635515841334943}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.03806142687309993}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.26455026455026454, "acc_stderr,none": 0.022717467897708624}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25806451612903225, "acc_stderr,none": 0.02489246917246284}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.031947400722655395}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252603}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.03603038545360384}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.20833333333333334, "acc_stderr,none": 0.027696910713093936}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.24107142857142858, "acc_stderr,none": 0.04059867246952687}, "mmlu_pro": {"exact_match,custom-extract": 0.10247672872340426, "exact_match_stderr,custom-extract": 0.0027583385965322723, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.12552301255230125, "exact_match_stderr,custom-extract": 0.012381673804057346}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10266159695817491, "exact_match_stderr,custom-extract": 0.010812323380686599}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07420494699646643, "exact_match_stderr,custom-extract": 0.007793679728569101}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.08780487804878048, "exact_match_stderr,custom-extract": 0.013993989404782777}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.11966824644549763, 
"exact_match_stderr,custom-extract": 0.011178894558789447}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08771929824561403, "exact_match_stderr,custom-extract": 0.009092308015144589}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1466992665036675, "exact_match_stderr,custom-extract": 0.012378100440505432}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10498687664041995, "exact_match_stderr,custom-extract": 0.015724991203554552}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09900090826521345, "exact_match_stderr,custom-extract": 0.009005035380672355}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07105847520355292, "exact_match_stderr,custom-extract": 0.006992544617386941}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10930735930735931, "exact_match_stderr,custom-extract": 0.010270410036987688}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13026052104208416, "exact_match_stderr,custom-extract": 0.015082951205521087}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08852963818321787, "exact_match_stderr,custom-extract": 0.007884574735319075}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.13784461152882205, "exact_match_stderr,custom-extract": 0.012211204647685912}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.246, "acc_stderr,none": 0.019279819056352555, "acc_norm,none": 0.35, "acc_norm_stderr,none": 0.021352091786223108}, "piqa": {"alias": "piqa", "acc,none": 0.6702937976060935, "acc_stderr,none": 0.010968357083095152, "acc_norm,none": 0.6730141458106638, "acc_norm_stderr,none": 0.010945157126978243}, "race": {"alias": "race", "acc,none": 0.36076555023923446, "acc_stderr,none": 0.014862517074604979}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.41299897645854655, "acc_stderr,none": 0.011141477698035243}, "winogrande": {"alias": "winogrande", "acc,none": 0.6006314127861089, "acc_stderr,none": 0.013764933546717616}} {"created_at": "2025-04-23T09:45:59.057088", "global_step": 234000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3319112627986348, "acc_stderr,none": 0.013760988200880541, "acc_norm,none": 0.36177474402730375, "acc_norm_stderr,none": 0.014041957945038078}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.640993265993266, "acc_stderr,none": 0.009843424713072176, "acc_norm,none": 0.6275252525252525, "acc_norm_stderr,none": 0.00992046921573602}, "boolq": {"alias": "boolq", "acc,none": 0.6510703363914373, "acc_stderr,none": 0.008336340399970105}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20393120393120392, "acc_stderr,none": 0.011535521334313655}, "copa": {"alias": "copa", "acc,none": 0.74, "acc_stderr,none": 0.0440844002276808}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.40141406094403503, "acc_stderr,none": 0.004891826692722825, "acc_norm,none": 0.525592511451902, "acc_norm_stderr,none": 0.004983240744101381}, "mmlu": {"acc,none": 0.26100270616721266, "acc_stderr,none": 0.0037039387477209042, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24399574920297556, "acc_stderr,none": 0.006263108626309246, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.31746031746031744, "acc_stderr,none": 0.04163453031302859}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2727272727272727, 
"acc_stderr,none": 0.03477691162163659}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.030587591351604243}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2489451476793249, "acc_stderr,none": 0.028146970599422644}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.1652892561983471, "acc_stderr,none": 0.033907806129727755}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.042365112580946315}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.033519538795212696}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.023445826276545546}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24804469273743016, "acc_stderr,none": 0.01444415780826146}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.22508038585209003, "acc_stderr,none": 0.02372008851617903}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.023788583551658533}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24119947848761408, "acc_stderr,none": 0.010926496102034966}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.21637426900584794, "acc_stderr,none": 0.031581495393387324}, "mmlu_other": {"acc,none": 0.2661731573865465, "acc_stderr,none": 0.007923517296291356, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.30566037735849055, "acc_stderr,none": 0.028353298073322666}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.03345036916788991}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.21076233183856502, "acc_stderr,none": 0.027373095500540193}, "mmlu_management": {"alias": " - management", "acc,none": 0.33980582524271846, "acc_stderr,none": 0.04689765937278134}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2606837606837607, "acc_stderr,none": 0.028760348956523414}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24648786717752236, "acc_stderr,none": 0.015411308769686929}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2973856209150327, "acc_stderr,none": 0.026173908506718576}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.026992199173064356}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.22426470588235295, "acc_stderr,none": 0.02533684856333238}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.27710843373493976, "acc_stderr,none": 0.03484331592680589}, "mmlu_social_sciences": {"acc,none": 0.27689307767305815, "acc_stderr,none": 0.008059586373454069, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", 
"acc,none": 0.3181818181818182, "acc_stderr,none": 0.0331847733384533}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2694300518134715, "acc_stderr,none": 0.03201867122877794}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.32564102564102565, "acc_stderr,none": 0.02375966576741229}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31932773109243695, "acc_stderr,none": 0.030283995525884396}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.27155963302752295, "acc_stderr,none": 0.019069098363191445}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2748091603053435, "acc_stderr,none": 0.039153454088478354}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.23039215686274508, "acc_stderr,none": 0.017035229258034034}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.041723430387053825}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2938775510204082, "acc_stderr,none": 0.029162738410249776}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2835820895522388, "acc_stderr,none": 0.03187187537919799}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_stem": {"acc,none": 0.265778623533143, "acc_stderr,none": 0.007870042723677898, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.038201699145179055}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.28289473684210525, "acc_stderr,none": 0.03665349695640767}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566017}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.04280105837364395}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.19574468085106383, "acc_stderr,none": 0.025937853139977148}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.022418042891113935}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.27741935483870966, "acc_stderr,none": 0.025470196835900055}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2660098522167488, "acc_stderr,none": 0.03108982600293752}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_high_school_mathematics": 
{"alias": " - high_school_mathematics", "acc,none": 0.2740740740740741, "acc_stderr,none": 0.027195934804085626}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.24503311258278146, "acc_stderr,none": 0.035118075718047245}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.32407407407407407, "acc_stderr,none": 0.03191923445686186}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2767857142857143, "acc_stderr,none": 0.042466243366976256}, "mmlu_pro": {"exact_match,custom-extract": 0.10571808510638298, "exact_match_stderr,custom-extract": 0.002787122774449549, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.14504881450488144, "exact_match_stderr,custom-extract": 0.013160465168737609}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11153358681875793, "exact_match_stderr,custom-extract": 0.011213991771867698}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.05918727915194346, "exact_match_stderr,custom-extract": 0.007016725322511607}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.12195121951219512, "exact_match_stderr,custom-extract": 0.0161804554422017}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.13744075829383887, "exact_match_stderr,custom-extract": 0.011858737350418426}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08359133126934984, "exact_match_stderr,custom-extract": 0.008895851747412878}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1430317848410758, "exact_match_stderr,custom-extract": 0.012248632595040434}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.15485564304461943, "exact_match_stderr,custom-extract": 0.018558256274560207}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09264305177111716, "exact_match_stderr,custom-extract": 0.0087417658258632}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.05995558845299778, "exact_match_stderr,custom-extract": 0.006461333188096242}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12229437229437229, "exact_match_stderr,custom-extract": 0.01078392421390481}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12424849699398798, "exact_match_stderr,custom-extract": 0.014781596611020425}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07775211701308699, "exact_match_stderr,custom-extract": 0.007432631448995912}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.16416040100250626, "exact_match_stderr,custom-extract": 0.013120987227657867}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.24, "acc_stderr,none": 0.01911886665375974, "acc_norm,none": 0.364, "acc_norm_stderr,none": 0.02153917063731769}, "piqa": {"alias": "piqa", "acc,none": 0.6697497279651795, "acc_stderr,none": 0.010972947133006285, "acc_norm,none": 0.6702937976060935, "acc_norm_stderr,none": 0.010968357083095152}, "race": {"alias": "race", "acc,none": 0.3416267942583732, "acc_stderr,none": 0.014677827770761077}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4232343909928352, "acc_stderr,none": 0.011179928646626206}, "winogrande": {"alias": "winogrande", "acc,none": 0.5887924230465666, "acc_stderr,none": 0.013829128358676857}} {"created_at": 
"2025-04-23T11:37:51.917246", "global_step": 236000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.32593856655290104, "acc_stderr,none": 0.01369743246669324, "acc_norm,none": 0.3609215017064846, "acc_norm_stderr,none": 0.014034761386175465}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6435185185185185, "acc_stderr,none": 0.009828046544504428, "acc_norm,none": 0.5917508417508418, "acc_norm_stderr,none": 0.010085566195791259}, "boolq": {"alias": "boolq", "acc,none": 0.6602446483180429, "acc_stderr,none": 0.00828377201314756}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19737919737919737, "acc_stderr,none": 0.011395305685091195}, "copa": {"alias": "copa", "acc,none": 0.73, "acc_stderr,none": 0.0446196043338474}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3972316271659032, "acc_stderr,none": 0.004883246579496663, "acc_norm,none": 0.5230033857797252, "acc_norm_stderr,none": 0.004984497871025246}, "mmlu": {"acc,none": 0.25623130608175476, "acc_stderr,none": 0.0036804055235331737, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.26609989373007437, "acc_stderr,none": 0.006435433428206487, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04006168083848878}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03225078108306289}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.03283472056108566}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.31645569620253167, "acc_stderr,none": 0.030274974880218974}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.32231404958677684, "acc_stderr,none": 0.04266416363352167}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3425925925925926, "acc_stderr,none": 0.045879047413018105}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.20245398773006135, "acc_stderr,none": 0.03157065078911903}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.25027932960893856, "acc_stderr,none": 0.014487500852850423}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.22186495176848875, "acc_stderr,none": 0.02359885829286305}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25, "acc_stderr,none": 0.02409347123262133}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2777053455019557, "acc_stderr,none": 0.011438741422769575}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.033773102522091945}, "mmlu_other": {"acc,none": 0.2645638879948503, "acc_stderr,none": 0.007905827472069844, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2641509433962264, "acc_stderr,none": 0.02713429162874171}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2254335260115607, "acc_stderr,none": 0.031862098516411454}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 
0.3004484304932735, "acc_stderr,none": 0.030769352008229136}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.04058042015646035}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674043}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.26181353767560667, "acc_stderr,none": 0.01572083867844526}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.28104575163398693, "acc_stderr,none": 0.025738854797818737}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.026992199173064356}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1948529411764706, "acc_stderr,none": 0.02406059942348742}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25301204819277107, "acc_stderr,none": 0.03384429155233134}, "mmlu_social_sciences": {"acc,none": 0.24666883327916803, "acc_stderr,none": 0.007772273229178489, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.19298245614035087, "acc_stderr,none": 0.037124548537213684}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2676767676767677, "acc_stderr,none": 0.03154449888270286}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.25906735751295334, "acc_stderr,none": 0.03161877917935408}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.22564102564102564, "acc_stderr,none": 0.021193632525148522}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.25630252100840334, "acc_stderr,none": 0.02835962087053395}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23119266055045873, "acc_stderr,none": 0.01807575024163315}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.29770992366412213, "acc_stderr,none": 0.04010358942462203}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.26633986928104575, "acc_stderr,none": 0.0178831881346672}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.19090909090909092, "acc_stderr,none": 0.03764425585984926}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22040816326530613, "acc_stderr,none": 0.026537045312145312}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.26865671641791045, "acc_stderr,none": 0.03134328358208954}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.24262607040913417, "acc_stderr,none": 0.007629486100733183, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.037125378336148665}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.27631578947368424, "acc_stderr,none": 0.03639057569952925}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2986111111111111, "acc_stderr,none": 0.038270523579507554}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.16, "acc_stderr,none": 
0.03684529491774709}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847415}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322674}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.1568627450980392, "acc_stderr,none": 0.036186648199362466}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.25957446808510637, "acc_stderr,none": 0.028659179374292326}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.022182037202948365}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24193548387096775, "acc_stderr,none": 0.024362599693031096}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.19704433497536947, "acc_stderr,none": 0.02798672466673622}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.22592592592592592, "acc_stderr,none": 0.02549753263960955}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.25165562913907286, "acc_stderr,none": 0.035433042343899844}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.027467401804057982}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.10139627659574468, "exact_match_stderr,custom-extract": 0.0027412485622756453, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.13389121338912133, "exact_match_stderr,custom-extract": 0.012726405288731832}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10899873257287707, "exact_match_stderr,custom-extract": 0.011101630697795408}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.05830388692579505, "exact_match_stderr,custom-extract": 0.006967433636014823}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.0951219512195122, "exact_match_stderr,custom-extract": 0.014506870947377837}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.15165876777251186, "exact_match_stderr,custom-extract": 0.012353933579536036}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07636738906088751, "exact_match_stderr,custom-extract": 0.008536226336689346}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12836185819070906, "exact_match_stderr,custom-extract": 0.011702403876565513}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10236220472440945, "exact_match_stderr,custom-extract": 0.015549935163883123}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10717529518619437, "exact_match_stderr,custom-extract": 0.00932683086037974}, "mmlu_pro_math": {"alias": " - math", 
"exact_match,custom-extract": 0.06513693560325684, "exact_match_stderr,custom-extract": 0.006716156044746726}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10822510822510822, "exact_match_stderr,custom-extract": 0.010225646711914631}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11422845691382766, "exact_match_stderr,custom-extract": 0.014253888115016522}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08622016936104696, "exact_match_stderr,custom-extract": 0.007790904368268136}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.14035087719298245, "exact_match_stderr,custom-extract": 0.012303793034734636}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.238, "acc_stderr,none": 0.019064072958198456, "acc_norm,none": 0.35, "acc_norm_stderr,none": 0.021352091786223104}, "piqa": {"alias": "piqa", "acc,none": 0.676822633297062, "acc_stderr,none": 0.01091197412428213, "acc_norm,none": 0.6822633297062024, "acc_norm_stderr,none": 0.010863133246569286}, "race": {"alias": "race", "acc,none": 0.34258373205741627, "acc_stderr,none": 0.014687684737145162}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4216990788126919, "acc_stderr,none": 0.011174475471772093}, "winogrande": {"alias": "winogrande", "acc,none": 0.5832675611681136, "acc_stderr,none": 0.013856250072796315}} {"created_at": "2025-04-23T13:29:46.238540", "global_step": 238000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.32593856655290104, "acc_stderr,none": 0.013697432466693242, "acc_norm,none": 0.3728668941979522, "acc_norm_stderr,none": 0.014131176760131167}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6481481481481481, "acc_stderr,none": 0.009799078929868706, "acc_norm,none": 0.6207912457912458, "acc_norm_stderr,none": 0.00995589166886556}, "boolq": {"alias": "boolq", "acc,none": 0.582262996941896, "acc_stderr,none": 0.008625883905552701}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.18755118755118755, "acc_stderr,none": 0.011175783964114741}, "copa": {"alias": "copa", "acc,none": 0.74, "acc_stderr,none": 0.0440844002276808}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4025094602668791, "acc_stderr,none": 0.004894012555642644, "acc_norm,none": 0.5239992033459471, "acc_norm_stderr,none": 0.004984030250507295}, "mmlu": {"acc,none": 0.23116365190143853, "acc_stderr,none": 0.0035522183282581964, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24335812964930925, "acc_stderr,none": 0.006253498588003139, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.040406101782088394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03225078108306289}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.030190282453501943}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2616033755274262, "acc_stderr,none": 0.028609516716994934}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.23140495867768596, "acc_stderr,none": 0.038498560987940904}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, 
"mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1832797427652733, "acc_stderr,none": 0.021974198848265816}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2191358024691358, "acc_stderr,none": 0.023016705640262196}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2529335071707953, "acc_stderr,none": 0.011102268713839987}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.24396523978113938, "acc_stderr,none": 0.007692314716113094, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891366}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173043}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2388250319284802, "acc_stderr,none": 0.015246803197398687}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22875816993464052, "acc_stderr,none": 0.024051029739912258}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24822695035460993, "acc_stderr,none": 0.025770015644290382}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.024562204314142317}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.2170945726356841, "acc_stderr,none": 0.00742772073174181, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281335}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.17676767676767677, "acc_stderr,none": 0.027178752639044915}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19170984455958548, "acc_stderr,none": 0.028408953626245282}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20512820512820512, "acc_stderr,none": 0.02047323317355198}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1926605504587156, "acc_stderr,none": 0.016909276884936094}, "mmlu_human_sexuality": {"alias": " - 
human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.01755581809132227}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.1836734693877551, "acc_stderr,none": 0.024789071332007657}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.21408182683158897, "acc_stderr,none": 0.007292448925857878, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.0335567721631314}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.18421052631578946, "acc_stderr,none": 0.031546980450822305}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.02880998985410297}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21164021164021163, "acc_stderr,none": 0.021037331505262893}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1774193548387097, "acc_stderr,none": 0.02173254068932927}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.15270935960591134, "acc_stderr,none": 0.025308904539380627}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.024882116857655113}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.025416428388767478}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.04432804055291518}, "mmlu_pro": {"exact_match,custom-extract": 0.1015625, "exact_match_stderr,custom-extract": 0.002737114931237238, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", 
"exact_match,custom-extract": 0.15341701534170155, "exact_match_stderr,custom-extract": 0.013468369459638495}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09125475285171103, "exact_match_stderr,custom-extract": 0.010258543729935026}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.045936395759717315, "exact_match_stderr,custom-extract": 0.006224949323477467}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.12439024390243902, "exact_match_stderr,custom-extract": 0.016318746710195602}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1504739336492891, "exact_match_stderr,custom-extract": 0.012314171688222526}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.1021671826625387, "exact_match_stderr,custom-extract": 0.009734547484109861}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1136919315403423, "exact_match_stderr,custom-extract": 0.011105705318793036}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12598425196850394, "exact_match_stderr,custom-extract": 0.017022602638569497}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.06357856494096276, "exact_match_stderr,custom-extract": 0.007356897259839131}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06365655070318282, "exact_match_stderr,custom-extract": 0.006644652222081495}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12229437229437229, "exact_match_stderr,custom-extract": 0.010783924213904813}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11022044088176353, "exact_match_stderr,custom-extract": 0.014033229017364503}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09314857582755966, "exact_match_stderr,custom-extract": 0.008067125867388525}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.15664160401002505, "exact_match_stderr,custom-extract": 0.012874503408736518}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.26, "acc_stderr,none": 0.019635965529725512, "acc_norm,none": 0.358, "acc_norm_stderr,none": 0.02146143486285912}, "piqa": {"alias": "piqa", "acc,none": 0.6664853101196954, "acc_stderr,none": 0.011000139592184575, "acc_norm,none": 0.6653971708378672, "acc_norm_stderr,none": 0.011009071725162497}, "race": {"alias": "race", "acc,none": 0.3473684210526316, "acc_stderr,none": 0.014735977850381395}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.40890481064483114, "acc_stderr,none": 0.011124710055682831}, "winogrande": {"alias": "winogrande", "acc,none": 0.6006314127861089, "acc_stderr,none": 0.013764933546717616}} {"created_at": "2025-04-23T15:19:20.502623", "global_step": 240000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3455631399317406, "acc_stderr,none": 0.013896938461145687, "acc_norm,none": 0.3890784982935154, "acc_norm_stderr,none": 0.014247309976045607}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6679292929292929, "acc_stderr,none": 0.009663817543072703, "acc_norm,none": 0.6321548821548821, "acc_norm_stderr,none": 0.009894923464455193}, "boolq": {"alias": "boolq", "acc,none": 0.65565749235474, "acc_stderr,none": 0.00831048505478299}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20884520884520885, "acc_stderr,none": 0.011637590576063062}, "copa": {"alias": "copa", "acc,none": 0.73, 
"acc_stderr,none": 0.0446196043338474}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4033061143198566, "acc_stderr,none": 0.004895586329401319, "acc_norm,none": 0.5267874925313683, "acc_norm_stderr,none": 0.004982615233057101}, "mmlu": {"acc,none": 0.2555903717419171, "acc_stderr,none": 0.0036757144236735126, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2456960680127524, "acc_stderr,none": 0.0062806912371718765, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.29365079365079366, "acc_stderr,none": 0.04073524322147126}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.033175059300091805}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.031145570659486782}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.02904133351059804}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2231404958677686, "acc_stderr,none": 0.03800754475228733}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.04133119440243839}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.20245398773006135, "acc_stderr,none": 0.031570650789119026}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0230836585869842}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23910614525139665, "acc_stderr,none": 0.01426555419233115}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2315112540192926, "acc_stderr,none": 0.023956532766639133}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.23765432098765432, "acc_stderr,none": 0.02368359183700856}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2503259452411995, "acc_stderr,none": 0.011064151027165426}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.25146198830409355, "acc_stderr,none": 0.033275044238468436}, "mmlu_other": {"acc,none": 0.26746057289990344, "acc_stderr,none": 0.007921664458628401, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.30943396226415093, "acc_stderr,none": 0.028450154794118627}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3063583815028902, "acc_stderr,none": 0.035149425512674394}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.242152466367713, "acc_stderr,none": 0.028751392398694755}, "mmlu_management": {"alias": " - management", "acc,none": 0.3786407766990291, "acc_stderr,none": 0.04802694698258973}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.27350427350427353, "acc_stderr,none": 0.029202540153431187}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110196}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24904214559386972, "acc_stderr,none": 0.015464676163395974}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3104575163398693, "acc_stderr,none": 0.026493033225145894}, "mmlu_professional_accounting": {"alias": " - 
professional_accounting", "acc,none": 0.25886524822695034, "acc_stderr,none": 0.026129572527180848}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20955882352941177, "acc_stderr,none": 0.02472311040767707}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21686746987951808, "acc_stderr,none": 0.03208284450356365}, "mmlu_social_sciences": {"acc,none": 0.2674683132921677, "acc_stderr,none": 0.00795841961910696, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518754}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3181818181818182, "acc_stderr,none": 0.03318477333845331}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2538860103626943, "acc_stderr,none": 0.03141024780565318}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.36153846153846153, "acc_stderr,none": 0.024359581465396993}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.028657491285071963}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26972477064220185, "acc_stderr,none": 0.01902848671111545}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.23366013071895425, "acc_stderr,none": 0.017119158496044503}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2636363636363636, "acc_stderr,none": 0.04220224692971987}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.23673469387755103, "acc_stderr,none": 0.027212835884073163}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.25870646766169153, "acc_stderr,none": 0.030965903123573023}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_stem": {"acc,none": 0.2470662860767523, "acc_stderr,none": 0.007673376505242298, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.037857144650666544}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.27631578947368424, "acc_stderr,none": 0.03639057569952925}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.03852084696008534}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.17647058823529413, "acc_stderr,none": 0.037932811853078126}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.19574468085106383, "acc_stderr,none": 0.025937853139977148}, "mmlu_electrical_engineering": {"alias": " - 
electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.03806142687309994}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24338624338624337, "acc_stderr,none": 0.022101128787415412}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25806451612903225, "acc_stderr,none": 0.024892469172462836}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1921182266009852, "acc_stderr,none": 0.027719315709614775}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.026842057873833706}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.24503311258278146, "acc_stderr,none": 0.035118075718047245}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.17592592592592593, "acc_stderr,none": 0.025967420958258526}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04287858751340456}, "mmlu_pro": {"exact_match,custom-extract": 0.09973404255319149, "exact_match_stderr,custom-extract": 0.0027215129611774095, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.11157601115760112, "exact_match_stderr,custom-extract": 0.011766276311081417}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09252217997465145, "exact_match_stderr,custom-extract": 0.010322332141863089}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0636042402826855, "exact_match_stderr,custom-extract": 0.0072567381353429465}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.08292682926829269, "exact_match_stderr,custom-extract": 0.013636027558244172}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.14218009478672985, "exact_match_stderr,custom-extract": 0.012028283958485734}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08152734778121776, "exact_match_stderr,custom-extract": 0.00879522781860575}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1295843520782396, "exact_match_stderr,custom-extract": 0.011749749223850742}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12335958005249344, "exact_match_stderr,custom-extract": 0.016869623436798518}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07175295186194369, "exact_match_stderr,custom-extract": 0.007781356843650031}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06513693560325684, "exact_match_stderr,custom-extract": 0.006716156044746727}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12229437229437229, "exact_match_stderr,custom-extract": 0.010783924213904813}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.10420841683366733, "exact_match_stderr,custom-extract": 0.013691159072055335}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1077752117013087, "exact_match_stderr,custom-extract": 0.008607147361594167}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.14661654135338345, "exact_match_stderr,custom-extract": 
0.012529520031289975}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.24, "acc_stderr,none": 0.019118866653759753, "acc_norm,none": 0.36, "acc_norm_stderr,none": 0.02148775108972052}, "piqa": {"alias": "piqa", "acc,none": 0.676278563656148, "acc_stderr,none": 0.010916765010708759, "acc_norm,none": 0.6898803046789989, "acc_norm_stderr,none": 0.010791876566843033}, "race": {"alias": "race", "acc,none": 0.3444976076555024, "acc_stderr,none": 0.014707199932728218}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.39508700102354144, "acc_stderr,none": 0.011062205128594174}, "winogrande": {"alias": "winogrande", "acc,none": 0.5966850828729282, "acc_stderr,none": 0.013787257285896231}} {"created_at": "2025-04-23T16:37:23.230611", "global_step": 242000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.33361774744027306, "acc_stderr,none": 0.01377868705417654, "acc_norm,none": 0.371160409556314, "acc_norm_stderr,none": 0.014117971901142817}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6599326599326599, "acc_stderr,none": 0.00972076549480528, "acc_norm,none": 0.6325757575757576, "acc_norm_stderr,none": 0.009892552616211558}, "boolq": {"alias": "boolq", "acc,none": 0.6085626911314985, "acc_stderr,none": 0.00853643052440395}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19246519246519248, "acc_stderr,none": 0.011286955409752646}, "copa": {"alias": "copa", "acc,none": 0.73, "acc_stderr,none": 0.0446196043338474}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.40001991635132444, "acc_stderr,none": 0.0048890079212147, "acc_norm,none": 0.5222067317267477, "acc_norm_stderr,none": 0.0049848576711871}, "mmlu": {"acc,none": 0.25430850306224184, "acc_stderr,none": 0.0036714682934243636, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24718384697130713, "acc_stderr,none": 0.006287979554240575, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1984126984126984, "acc_stderr,none": 0.03567016675276863}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.28484848484848485, "acc_stderr,none": 0.035243908445117836}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.031145570659486782}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3037974683544304, "acc_stderr,none": 0.029936696387138608}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.1652892561983471, "acc_stderr,none": 0.03390780612972776}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.04133119440243839}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.03226219377286774}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24566473988439305, "acc_stderr,none": 0.023176298203992005}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2435754189944134, "acc_stderr,none": 0.014355911964767865}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.27009646302250806, "acc_stderr,none": 0.02521804037341062}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.22839506172839505, "acc_stderr,none": 0.023358211840626267}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24771838331160365, "acc_stderr,none": 0.011025499291443737}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 
0.24561403508771928, "acc_stderr,none": 0.03301405946987251}, "mmlu_other": {"acc,none": 0.26649501126488573, "acc_stderr,none": 0.007915680383080772, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.26037735849056604, "acc_stderr,none": 0.02700876609070809}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2023121387283237, "acc_stderr,none": 0.030631145539198816}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3452914798206278, "acc_stderr,none": 0.03191100192835794}, "mmlu_management": {"alias": " - management", "acc,none": 0.2815533980582524, "acc_stderr,none": 0.04453254836326468}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.02948036054954119}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2886334610472541, "acc_stderr,none": 0.016203792703197797}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.238562091503268, "acc_stderr,none": 0.02440439492808787}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25886524822695034, "acc_stderr,none": 0.026129572527180848}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19852941176470587, "acc_stderr,none": 0.024231013370541097}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3253012048192771, "acc_stderr,none": 0.03647168523683229}, "mmlu_social_sciences": {"acc,none": 0.25089372765680856, "acc_stderr,none": 0.0078141475767021, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.029857515673386396}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22797927461139897, "acc_stderr,none": 0.030276909945178256}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2692307692307692, "acc_stderr,none": 0.02248938979365481}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24789915966386555, "acc_stderr,none": 0.028047967224176896}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24036697247706423, "acc_stderr,none": 0.01832060732096407}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.1984732824427481, "acc_stderr,none": 0.0349814938546247}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.28104575163398693, "acc_stderr,none": 0.018185218954318075}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.04350271442923243}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.27755102040816326, "acc_stderr,none": 0.02866685779027465}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23383084577114427, "acc_stderr,none": 0.02992941540834838}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.17, 
"acc_stderr,none": 0.0377525168068637}, "mmlu_stem": {"acc,none": 0.2562638756739613, "acc_stderr,none": 0.007766599394992301, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.03749850709174022}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.26973684210526316, "acc_stderr,none": 0.03611780560284898}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237654}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720683}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.33191489361702126, "acc_stderr,none": 0.03078373675774564}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.03695183311650232}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2566137566137566, "acc_stderr,none": 0.022494510767503154}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25806451612903225, "acc_stderr,none": 0.02489246917246284}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2955665024630542, "acc_stderr,none": 0.03210494433751458}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.026593939101844072}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.19205298013245034, "acc_stderr,none": 0.032162984205936156}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.19444444444444445, "acc_stderr,none": 0.026991454502036712}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03894641120044792}, "mmlu_pro": {"exact_match,custom-extract": 0.10347406914893617, "exact_match_stderr,custom-extract": 0.0027635626215441896, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.07252440725244072, "exact_match_stderr,custom-extract": 0.00969252271883848}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11787072243346007, "exact_match_stderr,custom-extract": 0.011486983100199015}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.045053003533568906, "exact_match_stderr,custom-extract": 0.006167656890375406}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.13414634146341464, "exact_match_stderr,custom-extract": 0.016851944127279115}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1481042654028436, 
"exact_match_stderr,custom-extract": 0.012233851872561223}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.11351909184726522, "exact_match_stderr,custom-extract": 0.010196038549390666}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.13080684596577016, "exact_match_stderr,custom-extract": 0.011796749495986752}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.13385826771653545, "exact_match_stderr,custom-extract": 0.017467280079326603}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08991825613079019, "exact_match_stderr,custom-extract": 0.008625172638334894}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07475943745373798, "exact_match_stderr,custom-extract": 0.007158029108285762}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10064935064935066, "exact_match_stderr,custom-extract": 0.009903054392913758}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11623246492985972, "exact_match_stderr,custom-extract": 0.014362104240159239}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09468822170900693, "exact_match_stderr,custom-extract": 0.008126615633066912}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.15914786967418545, "exact_match_stderr,custom-extract": 0.01295779391533379}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.246, "acc_stderr,none": 0.019279819056352558, "acc_norm,none": 0.368, "acc_norm_stderr,none": 0.021588982568353548}, "piqa": {"alias": "piqa", "acc,none": 0.6681175190424374, "acc_stderr,none": 0.01098661777636159, "acc_norm,none": 0.6692056583242655, "acc_norm_stderr,none": 0.010977520584714436}, "race": {"alias": "race", "acc,none": 0.3397129186602871, "acc_stderr,none": 0.014657914432586402}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4216990788126919, "acc_stderr,none": 0.011174475471772088}, "winogrande": {"alias": "winogrande", "acc,none": 0.5951065509076559, "acc_stderr,none": 0.01379592700312494}} {"created_at": "2025-04-23T18:47:14.951589", "global_step": 244000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.32337883959044367, "acc_stderr,none": 0.013669421630012134, "acc_norm,none": 0.3626279863481229, "acc_norm_stderr,none": 0.014049106564955007}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6452020202020202, "acc_stderr,none": 0.009817629113069697, "acc_norm,none": 0.5972222222222222, "acc_norm_stderr,none": 0.010063960494989163}, "boolq": {"alias": "boolq", "acc,none": 0.6648318042813456, "acc_stderr,none": 0.008256192949796775}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2031122031122031, "acc_stderr,none": 0.011518254793634088}, "copa": {"alias": "copa", "acc,none": 0.7, "acc_stderr,none": 0.046056618647183814}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.40380402310296754, "acc_stderr,none": 0.0048965631261168084, "acc_norm,none": 0.5237004580760805, "acc_norm_stderr,none": 0.004984172621822884}, "mmlu": {"acc,none": 0.2538812135023501, "acc_stderr,none": 0.0036709793744858976, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2609989373007439, "acc_stderr,none": 0.006405026507192017, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.03809523809523811}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.22424242424242424, 
"acc_stderr,none": 0.03256866661681102}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.031321798030832904}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25316455696202533, "acc_stderr,none": 0.028304657943035286}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2975206611570248, "acc_stderr,none": 0.04173349148083498}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.32407407407407407, "acc_stderr,none": 0.04524596007030049}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2331288343558282, "acc_stderr,none": 0.03322015795776741}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.26878612716763006, "acc_stderr,none": 0.023868003262500118}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2435754189944134, "acc_stderr,none": 0.014355911964767864}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.24437299035369775, "acc_stderr,none": 0.024406162094668886}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24691358024691357, "acc_stderr,none": 0.02399350170904211}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27444589308996087, "acc_stderr,none": 0.011397043163078154}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.03446296217088427}, "mmlu_other": {"acc,none": 0.2568393949147087, "acc_stderr,none": 0.007827666857370652, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2339622641509434, "acc_stderr,none": 0.026055296901152915}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.033450369167889904}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2825112107623318, "acc_stderr,none": 0.03021683101150878}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.03916667762822584}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.27350427350427353, "acc_stderr,none": 0.029202540153431177}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2720306513409962, "acc_stderr,none": 0.01591336744750052}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.024954184324879905}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.26595744680851063, "acc_stderr,none": 0.026358065698880592}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.16911764705882354, "acc_stderr,none": 0.02277086801011302}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.24829379265518361, "acc_stderr,none": 0.007788049660555463, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.04185774424022057}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 
0.25757575757575757, "acc_stderr,none": 0.031156269519646836}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.02869787397186068}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2282051282051282, "acc_stderr,none": 0.02127839386358628}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.226890756302521, "acc_stderr,none": 0.02720537153827948}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22568807339449543, "acc_stderr,none": 0.017923087667803053}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.037683359597287434}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2679738562091503, "acc_stderr,none": 0.017917974069594726}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04265792110940589}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2612244897959184, "acc_stderr,none": 0.028123429335142783}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2736318407960199, "acc_stderr,none": 0.03152439186555402}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_stem": {"acc,none": 0.2457976530288614, "acc_stderr,none": 0.0076593889177402836, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3037037037037037, "acc_stderr,none": 0.039725528847851375}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.28289473684210525, "acc_stderr,none": 0.03665349695640767}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.14, "acc_stderr,none": 0.03487350880197771}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.043364327079931785}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.251063829787234, "acc_stderr,none": 0.028346963777162452}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.03806142687309994}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.26455026455026454, "acc_stderr,none": 0.022717467897708617}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2161290322580645, "acc_stderr,none": 0.023415293433568518}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.21674876847290642, "acc_stderr,none": 0.028990331252516235}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720683}, "mmlu_high_school_mathematics": {"alias": 
" - high_school_mathematics", "acc,none": 0.22962962962962963, "acc_stderr,none": 0.025644108639267606}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2582781456953642, "acc_stderr,none": 0.035737053147634576}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.17592592592592593, "acc_stderr,none": 0.02596742095825853}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755806}, "mmlu_pro": {"exact_match,custom-extract": 0.11053856382978723, "exact_match_stderr,custom-extract": 0.002841135573601165, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.1603905160390516, "exact_match_stderr,custom-extract": 0.01371423221942838}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11026615969581749, "exact_match_stderr,custom-extract": 0.011158044019782572}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.05123674911660778, "exact_match_stderr,custom-extract": 0.0065559918973592535}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.10975609756097561, "exact_match_stderr,custom-extract": 0.015456358358757447}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.16824644549763032, "exact_match_stderr,custom-extract": 0.01288417141927949}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08668730650154799, "exact_match_stderr,custom-extract": 0.009043776534229912}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.13080684596577016, "exact_match_stderr,custom-extract": 0.011796749495986743}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.13123359580052493, "exact_match_stderr,custom-extract": 0.017321369455841548}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08265213442325159, "exact_match_stderr,custom-extract": 0.008302286601704921}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07772020725388601, "exact_match_stderr,custom-extract": 0.0072867091916161645}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.13203463203463203, "exact_match_stderr,custom-extract": 0.011142798517705913}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11222444889779559, "exact_match_stderr,custom-extract": 0.014144273960803895}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09930715935334873, "exact_match_stderr,custom-extract": 0.008301207861994725}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.17418546365914786, "exact_match_stderr,custom-extract": 0.013434393206966797}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.234, "acc_stderr,none": 0.018952741564893686, "acc_norm,none": 0.366, "acc_norm_stderr,none": 0.021564276850201614}, "piqa": {"alias": "piqa", "acc,none": 0.6746463547334058, "acc_stderr,none": 0.010931036623525191, "acc_norm,none": 0.6789989118607181, "acc_norm_stderr,none": 0.010892641574707904}, "race": {"alias": "race", "acc,none": 0.3416267942583732, "acc_stderr,none": 0.014677827770761077}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4078812691914023, "acc_stderr,none": 0.011120393600595453}, "winogrande": {"alias": "winogrande", "acc,none": 0.5943172849250198, "acc_stderr,none": 0.013800206336014208}} {"created_at": 
"2025-04-23T20:48:27.489615", "global_step": 246000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.318259385665529, "acc_stderr,none": 0.013611993916971451, "acc_norm,none": 0.363481228668942, "acc_norm_stderr,none": 0.014056207319068282}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6502525252525253, "acc_stderr,none": 0.009785578618940725, "acc_norm,none": 0.6195286195286195, "acc_norm_stderr,none": 0.009962305992058579}, "boolq": {"alias": "boolq", "acc,none": 0.6464831804281346, "acc_stderr,none": 0.008361346005339401}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.1891891891891892, "acc_stderr,none": 0.011213159711868613}, "copa": {"alias": "copa", "acc,none": 0.73, "acc_stderr,none": 0.0446196043338474}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4025094602668791, "acc_stderr,none": 0.004894012555642645, "acc_norm,none": 0.5257916749651463, "acc_norm_stderr,none": 0.00498313847960438}, "mmlu": {"acc,none": 0.26185728528699614, "acc_stderr,none": 0.003706316865076406, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24697130712008503, "acc_stderr,none": 0.006285026956787738, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.03852273364924316}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.03401506715249039}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3284313725490196, "acc_stderr,none": 0.03296245110172229}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.26582278481012656, "acc_stderr,none": 0.028756799629658342}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.19834710743801653, "acc_stderr,none": 0.03640118271990945}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26380368098159507, "acc_stderr,none": 0.03462419931615623}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.26878612716763006, "acc_stderr,none": 0.023868003262500118}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.21675977653631284, "acc_stderr,none": 0.013780598486443354}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.24758842443729903, "acc_stderr,none": 0.024513879973621967}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.22530864197530864, "acc_stderr,none": 0.02324620264781975}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24902216427640156, "acc_stderr,none": 0.01104489226404077}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2573099415204678, "acc_stderr,none": 0.03352799844161865}, "mmlu_other": {"acc,none": 0.2632764724814934, "acc_stderr,none": 0.007884733128122323, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.30566037735849055, "acc_stderr,none": 0.028353298073322666}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.03295304696818318}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_human_aging": {"alias": " - human_aging", 
"acc,none": 0.35874439461883406, "acc_stderr,none": 0.03219079200419996}, "mmlu_management": {"alias": " - management", "acc,none": 0.24271844660194175, "acc_stderr,none": 0.04245022486384495}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2692307692307692, "acc_stderr,none": 0.029058588303748845}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2388250319284802, "acc_stderr,none": 0.015246803197398696}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.02463004897982479}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24822695035460993, "acc_stderr,none": 0.025770015644290396}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.025767252010855963}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3313253012048193, "acc_stderr,none": 0.036643147772880864}, "mmlu_social_sciences": {"acc,none": 0.2723431914202145, "acc_stderr,none": 0.008015874317398195, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.040969851398436716}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.23232323232323232, "acc_stderr,none": 0.030088629490217483}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.27461139896373055, "acc_stderr,none": 0.03221024508041154}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.32051282051282054, "acc_stderr,none": 0.023661296393964273}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.02755361446786379}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23669724770642203, "acc_stderr,none": 0.01822407811729908}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.21374045801526717, "acc_stderr,none": 0.0359546161177469}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27941176470588236, "acc_stderr,none": 0.018152871051538802}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3181818181818182, "acc_stderr,none": 0.044612721759105085}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.32653061224489793, "acc_stderr,none": 0.030021056238440296}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.27860696517412936, "acc_stderr,none": 0.031700561834973086}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_stem": {"acc,none": 0.2724389470345703, "acc_stderr,none": 0.007933502044868148, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.03690677986137282}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.03476590104304134}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.27, 
"acc_stderr,none": 0.044619604333847394}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.35294117647058826, "acc_stderr,none": 0.04755129616062946}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28085106382978725, "acc_stderr,none": 0.029379170464124825}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.0360010569272777}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.022860838309232075}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2645161290322581, "acc_stderr,none": 0.025091892378859275}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2660098522167488, "acc_stderr,none": 0.03108982600293752}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.22592592592592592, "acc_stderr,none": 0.025497532639609546}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2980132450331126, "acc_stderr,none": 0.03734535676787198}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3101851851851852, "acc_stderr,none": 0.031546962856566295}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.11627327127659574, "exact_match_stderr,custom-extract": 0.0029111359782472363, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.16596931659693165, "exact_match_stderr,custom-extract": 0.013904276830147692}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09759188846641319, "exact_match_stderr,custom-extract": 0.010571710152486615}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06978798586572438, "exact_match_stderr,custom-extract": 0.007576175072607238}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.12682926829268293, "exact_match_stderr,custom-extract": 0.016454991959400204}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1623222748815166, "exact_match_stderr,custom-extract": 0.012700293472235506}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10319917440660474, "exact_match_stderr,custom-extract": 0.00977796396738705}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12591687041564792, "exact_match_stderr,custom-extract": 0.011606661034408551}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12860892388451445, "exact_match_stderr,custom-extract": 0.01717316362524469}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11080835603996367, "exact_match_stderr,custom-extract": 0.009464280420790068}, "mmlu_pro_math": {"alias": " - math", 
"exact_match,custom-extract": 0.08216136195410807, "exact_match_stderr,custom-extract": 0.007473948460954466}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12554112554112554, "exact_match_stderr,custom-extract": 0.010905908590641358}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.14228456913827656, "exact_match_stderr,custom-extract": 0.0156543789197872}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.100846805234796, "exact_match_stderr,custom-extract": 0.00835815790875494}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.16541353383458646, "exact_match_stderr,custom-extract": 0.013161095126056706}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.218, "acc_stderr,none": 0.01848337822317886, "acc_norm,none": 0.364, "acc_norm_stderr,none": 0.021539170637317695}, "piqa": {"alias": "piqa", "acc,none": 0.6741022850924918, "acc_stderr,none": 0.010935760218903953, "acc_norm,none": 0.6741022850924918, "acc_norm_stderr,none": 0.010935760218903958}, "race": {"alias": "race", "acc,none": 0.3492822966507177, "acc_stderr,none": 0.014754834713104488}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.41299897645854655, "acc_stderr,none": 0.011141477698035243}, "winogrande": {"alias": "winogrande", "acc,none": 0.579321231254933, "acc_stderr,none": 0.01387452637200832}} {"created_at": "2025-04-23T22:40:43.900664", "global_step": 248000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.31399317406143346, "acc_stderr,none": 0.013562691224726297, "acc_norm,none": 0.35665529010238906, "acc_norm_stderr,none": 0.013998056902620204}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6452020202020202, "acc_stderr,none": 0.009817629113069696, "acc_norm,none": 0.6157407407407407, "acc_norm_stderr,none": 0.009981120724601434}, "boolq": {"alias": "boolq", "acc,none": 0.6795107033639144, "acc_stderr,none": 0.008162016261049407}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19983619983619982, "acc_stderr,none": 0.011448447996728388}, "copa": {"alias": "copa", "acc,none": 0.76, "acc_stderr,none": 0.04292346959909283}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4010157339175463, "acc_stderr,none": 0.0048910255336330295, "acc_norm,none": 0.5297749452300339, "acc_norm_stderr,none": 0.004980926198798985}, "mmlu": {"acc,none": 0.25893747329440253, "acc_stderr,none": 0.0036936616681164354, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.26503719447396384, "acc_stderr,none": 0.006430004684056622, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.0361960452412425}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2, "acc_stderr,none": 0.031234752377721175}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.031145570659486782}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.31223628691983124, "acc_stderr,none": 0.03016513786784701}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.34710743801652894, "acc_stderr,none": 0.04345724570292535}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25766871165644173, "acc_stderr,none": 0.03436150827846917}, 
"mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.25722543352601157, "acc_stderr,none": 0.023532925431044283}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23575418994413408, "acc_stderr,none": 0.014196375686290804}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.26688102893890675, "acc_stderr,none": 0.025122637608816632}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2932098765432099, "acc_stderr,none": 0.02532988817190092}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2711864406779661, "acc_stderr,none": 0.011354581451622981}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.29239766081871343, "acc_stderr,none": 0.034886477134579215}, "mmlu_other": {"acc,none": 0.2597360798197618, "acc_stderr,none": 0.007862478243942162, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24150943396226415, "acc_stderr,none": 0.026341480371118352}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23121387283236994, "acc_stderr,none": 0.03214737302029469}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.26905829596412556, "acc_stderr,none": 0.029763779406874972}, "mmlu_management": {"alias": " - management", "acc,none": 0.22330097087378642, "acc_stderr,none": 0.04123553189891431}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.02948036054954119}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2720306513409962, "acc_stderr,none": 0.015913367447500514}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.27124183006535946, "acc_stderr,none": 0.025457756696667885}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2695035460992908, "acc_stderr,none": 0.02646903681859063}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1801470588235294, "acc_stderr,none": 0.02334516361654486}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2891566265060241, "acc_stderr,none": 0.03529486801511115}, "mmlu_social_sciences": {"acc,none": 0.25024374390640236, "acc_stderr,none": 0.007801973220864219, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.041424397194893624}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.03173071239071724}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22797927461139897, "acc_stderr,none": 0.03027690994517826}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.23076923076923078, "acc_stderr,none": 0.021362027725222717}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23949579831932774, "acc_stderr,none": 0.027722065493361266}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.21467889908256882, "acc_stderr,none": 0.017604304149256487}, "mmlu_human_sexuality": {"alias": " - 
human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596919}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.28921568627450983, "acc_stderr,none": 0.018342529845275915}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.19090909090909092, "acc_stderr,none": 0.03764425585984926}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24489795918367346, "acc_stderr,none": 0.027529637440174923}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.31343283582089554, "acc_stderr,none": 0.03280188205348642}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_stem": {"acc,none": 0.2575325087218522, "acc_stderr,none": 0.007780182608739817, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.03944624162501116}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.28289473684210525, "acc_stderr,none": 0.03665349695640766}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2986111111111111, "acc_stderr,none": 0.03827052357950756}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036845}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.1568627450980392, "acc_stderr,none": 0.03618664819936246}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.25957446808510637, "acc_stderr,none": 0.028659179374292323}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2896551724137931, "acc_stderr,none": 0.03780019230438014}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2804232804232804, "acc_stderr,none": 0.023135287974325618}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25161290322580643, "acc_stderr,none": 0.024685979286239956}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.22167487684729065, "acc_stderr,none": 0.029225575892489614}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117317}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.22592592592592592, "acc_stderr,none": 0.025497532639609556}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2847682119205298, "acc_stderr,none": 0.03684881521389023}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.027467401804057982}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25892857142857145, "acc_stderr,none": 0.041577515398656284}, "mmlu_pro": {"exact_match,custom-extract": 0.10929188829787234, "exact_match_stderr,custom-extract": 0.0028337301218970775, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", 
"exact_match,custom-extract": 0.1799163179916318, "exact_match_stderr,custom-extract": 0.014355153849952946}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09885931558935361, "exact_match_stderr,custom-extract": 0.010632661544075474}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0715547703180212, "exact_match_stderr,custom-extract": 0.007664187803003893}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1048780487804878, "exact_match_stderr,custom-extract": 0.015150318019731044}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.14336492890995262, "exact_match_stderr,custom-extract": 0.01206995358072277}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08875128998968008, "exact_match_stderr,custom-extract": 0.009140461457488633}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1136919315403423, "exact_match_stderr,custom-extract": 0.011105705318793003}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11811023622047244, "exact_match_stderr,custom-extract": 0.016556141198042412}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10354223433242507, "exact_match_stderr,custom-extract": 0.009186019023092292}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07772020725388601, "exact_match_stderr,custom-extract": 0.007286709191616158}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12012987012987013, "exact_match_stderr,custom-extract": 0.0107012359647263}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13026052104208416, "exact_match_stderr,custom-extract": 0.01508295120552109}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09468822170900693, "exact_match_stderr,custom-extract": 0.008126615633066913}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.15162907268170425, "exact_match_stderr,custom-extract": 0.01270442364591538}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.24, "acc_stderr,none": 0.019118866653759743, "acc_norm,none": 0.36, "acc_norm_stderr,none": 0.02148775108972052}, "piqa": {"alias": "piqa", "acc,none": 0.676822633297062, "acc_stderr,none": 0.01091197412428213, "acc_norm,none": 0.6828073993471164, "acc_norm_stderr,none": 0.010858155454380871}, "race": {"alias": "race", "acc,none": 0.35119617224880384, "acc_stderr,none": 0.014773430019036974}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.413510747185261, "acc_stderr,none": 0.011143517724432122}, "winogrande": {"alias": "winogrande", "acc,none": 0.5777426992896606, "acc_stderr,none": 0.013881582030658557}} {"created_at": "2025-04-24T00:24:05.816092", "global_step": 250000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3267918088737201, "acc_stderr,none": 0.013706665975587333, "acc_norm,none": 0.3703071672354949, "acc_norm_stderr,none": 0.01411129875167495}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6637205387205387, "acc_stderr,none": 0.009694178072725207, "acc_norm,none": 0.6397306397306397, "acc_norm_stderr,none": 0.009851002584732382}, "boolq": {"alias": "boolq", "acc,none": 0.6737003058103975, "acc_stderr,none": 0.008200385052427124}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2031122031122031, "acc_stderr,none": 0.011518254793634107}, "copa": {"alias": "copa", "acc,none": 0.75, 
"acc_stderr,none": 0.04351941398892446}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.401911969727146, "acc_stderr,none": 0.00489282341554655, "acc_norm,none": 0.5256920932085242, "acc_norm_stderr,none": 0.004983189711208507}, "mmlu": {"acc,none": 0.26719840478564305, "acc_stderr,none": 0.0037335006776764907, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2686503719447396, "acc_stderr,none": 0.006458483845969061, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23015873015873015, "acc_stderr,none": 0.03764950879790607}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3212121212121212, "acc_stderr,none": 0.03646204963253811}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3480392156862745, "acc_stderr,none": 0.03343311240488418}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.28270042194092826, "acc_stderr,none": 0.029312814153955924}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.03984979653302871}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.043300437496507416}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615768}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2658959537572254, "acc_stderr,none": 0.023786203255508283}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2861736334405145, "acc_stderr,none": 0.025670259242188936}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2839506172839506, "acc_stderr,none": 0.02508947852376513}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27183833116036504, "acc_stderr,none": 0.011363135278651418}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2046783625730994, "acc_stderr,none": 0.03094445977853322}, "mmlu_other": {"acc,none": 0.27132281943997427, "acc_stderr,none": 0.007973823076775891, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.27547169811320754, "acc_stderr,none": 0.027495663683724053}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23121387283236994, "acc_stderr,none": 0.03214737302029471}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.32286995515695066, "acc_stderr,none": 0.031381476375754995}, "mmlu_management": {"alias": " - management", "acc,none": 0.27184466019417475, "acc_stderr,none": 0.044052680241409216}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.25213675213675213, "acc_stderr,none": 0.02844796547623101}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.25287356321839083, "acc_stderr,none": 0.015543377313719681}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2875816993464052, "acc_stderr,none": 0.02591780611714716}, "mmlu_professional_accounting": {"alias": " - 
professional_accounting", "acc,none": 0.2907801418439716, "acc_stderr,none": 0.027090664368353178}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.025767252010855963}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.26506024096385544, "acc_stderr,none": 0.03436024037944967}, "mmlu_social_sciences": {"acc,none": 0.27526811829704256, "acc_stderr,none": 0.008054741692261865, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.04303684033537315}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25252525252525254, "acc_stderr,none": 0.030954055470365907}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2538860103626943, "acc_stderr,none": 0.03141024780565319}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2743589743589744, "acc_stderr,none": 0.022622765767493228}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2605042016806723, "acc_stderr,none": 0.028510251512341933}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25504587155963304, "acc_stderr,none": 0.018688500856535832}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.018635594034423972}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.34545454545454546, "acc_stderr,none": 0.04554619617541054}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2571428571428571, "acc_stderr,none": 0.02797982353874455}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.29850746268656714, "acc_stderr,none": 0.032357437893550424}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_stem": {"acc,none": 0.2530922930542341, "acc_stderr,none": 0.007749244524081458, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.037857144650666516}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.24342105263157895, "acc_stderr,none": 0.034923496688842384}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566018}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.04280105837364395}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.02880998985410297}, "mmlu_electrical_engineering": {"alias": " - 
electrical_engineering", "acc,none": 0.30344827586206896, "acc_stderr,none": 0.038312260488503336}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.022019080012217897}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24516129032258063, "acc_stderr,none": 0.024472243840895528}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.031947400722655395}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.026067159222275794}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2980132450331126, "acc_stderr,none": 0.037345356767871984}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18055555555555555, "acc_stderr,none": 0.026232878971491666}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2767857142857143, "acc_stderr,none": 0.04246624336697624}, "mmlu_pro": {"exact_match,custom-extract": 0.10704787234042554, "exact_match_stderr,custom-extract": 0.0028106804452317506, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.14783821478382148, "exact_match_stderr,custom-extract": 0.013264713398900195}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09252217997465145, "exact_match_stderr,custom-extract": 0.010322332141863075}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07243816254416961, "exact_match_stderr,custom-extract": 0.00770768302902095}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.11707317073170732, "exact_match_stderr,custom-extract": 0.015897520483461713}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1481042654028436, "exact_match_stderr,custom-extract": 0.012233851872561227}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09700722394220847, "exact_match_stderr,custom-extract": 0.009512759072461458}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.14547677261613692, "exact_match_stderr,custom-extract": 0.012335243774582408}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11023622047244094, "exact_match_stderr,custom-extract": 0.016065998434778184}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1008174386920981, "exact_match_stderr,custom-extract": 0.009078109672456583}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07994078460399703, "exact_match_stderr,custom-extract": 0.0073811700146960025}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12987012987012986, "exact_match_stderr,custom-extract": 0.011064857512116033}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.08817635270541083, "exact_match_stderr,custom-extract": 0.012706233135747383}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09006928406466513, "exact_match_stderr,custom-extract": 0.007946120960248706}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12406015037593984, "exact_match_stderr,custom-extract": 
0.011676807835852744}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.248, "acc_stderr,none": 0.019332342821239103, "acc_norm,none": 0.364, "acc_norm_stderr,none": 0.021539170637317695}, "piqa": {"alias": "piqa", "acc,none": 0.6670293797606094, "acc_stderr,none": 0.010995648822619072, "acc_norm,none": 0.6784548422198041, "acc_norm_stderr,none": 0.010897500107575652}, "race": {"alias": "race", "acc,none": 0.3521531100478469, "acc_stderr,none": 0.014782629897202259}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43244626407369496, "acc_stderr,none": 0.011210331273967561}, "winogrande": {"alias": "winogrande", "acc,none": 0.5998421468034728, "acc_stderr,none": 0.013769472660464991}} {"created_at": "2025-04-24T02:35:41.343846", "global_step": 252000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3191126279863481, "acc_stderr,none": 0.013621696119173313, "acc_norm,none": 0.3643344709897611, "acc_norm_stderr,none": 0.014063260279882417}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6464646464646465, "acc_stderr,none": 0.009809728948151493, "acc_norm,none": 0.6212121212121212, "acc_norm_stderr,none": 0.00995373765654204}, "boolq": {"alias": "boolq", "acc,none": 0.4951070336391437, "acc_stderr,none": 0.008744636233555049}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20884520884520885, "acc_stderr,none": 0.011637590576063055}, "copa": {"alias": "copa", "acc,none": 0.73, "acc_stderr,none": 0.04461960433384741}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4027086237801235, "acc_stderr,none": 0.004894407257215796, "acc_norm,none": 0.528281218880701, "acc_norm_stderr,none": 0.004981793089848264}, "mmlu": {"acc,none": 0.26577410625267056, "acc_stderr,none": 0.003724816056872042, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2558979808714134, "acc_stderr,none": 0.006362298799787076, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.038932596106046734}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.296969696969697, "acc_stderr,none": 0.035679697722680474}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28921568627450983, "acc_stderr,none": 0.031822318676475524}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.29957805907172996, "acc_stderr,none": 0.029818024749753095}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070418}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2085889570552147, "acc_stderr,none": 0.03192193448934723}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24566473988439305, "acc_stderr,none": 0.023176298203992016}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24134078212290502, "acc_stderr,none": 0.014310999547961447}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.21864951768488747, "acc_stderr,none": 0.023475581417861102}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.27469135802469136, "acc_stderr,none": 0.024836057868294677}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25684485006518903, "acc_stderr,none": 0.011158455853098841}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 
0.27485380116959063, "acc_stderr,none": 0.03424042924691584}, "mmlu_other": {"acc,none": 0.2735757965883489, "acc_stderr,none": 0.00799214491902263, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2981132075471698, "acc_stderr,none": 0.028152837942493864}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2774566473988439, "acc_stderr,none": 0.03414014007044036}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.32286995515695066, "acc_stderr,none": 0.031381476375755}, "mmlu_management": {"alias": " - management", "acc,none": 0.2912621359223301, "acc_stderr,none": 0.04498676320572924}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.03088273697413865}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.26436781609195403, "acc_stderr,none": 0.015769984840690515}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.025829163272757465}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25177304964539005, "acc_stderr,none": 0.025892151156709405}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.24632352941176472, "acc_stderr,none": 0.02617343857052}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21084337349397592, "acc_stderr,none": 0.03175554786629921}, "mmlu_social_sciences": {"acc,none": 0.27006824829379267, "acc_stderr,none": 0.007993530109761591, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3434343434343434, "acc_stderr,none": 0.03383201223244442}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2694300518134715, "acc_stderr,none": 0.03201867122877794}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.28717948717948716, "acc_stderr,none": 0.022939925418530602}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2815126050420168, "acc_stderr,none": 0.029213549414372163}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24220183486238533, "acc_stderr,none": 0.01836817630659862}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.036412970813137296}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2369281045751634, "acc_stderr,none": 0.017201662169789775}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2636363636363636, "acc_stderr,none": 0.04220224692971987}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3224489795918367, "acc_stderr,none": 0.029923100563683906}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2935323383084577, "acc_stderr,none": 0.03220024104534205}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.33, "acc_stderr,none": 
0.04725815626252604}, "mmlu_stem": {"acc,none": 0.26863304789089754, "acc_stderr,none": 0.00788489025399116, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.03785714465066653}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.03583496176361064}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3263888888888889, "acc_stderr,none": 0.03921067198982266}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252606}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.038739587141493545}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3404255319148936, "acc_stderr,none": 0.030976692998534436}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.022019080012217904}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2806451612903226, "acc_stderr,none": 0.025560604721022884}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2561576354679803, "acc_stderr,none": 0.0307127300709826}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23333333333333334, "acc_stderr,none": 0.02578787422095932}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31125827814569534, "acc_stderr,none": 0.03780445850526733}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.30092592592592593, "acc_stderr,none": 0.03128039084329882}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04287858751340456}, "mmlu_pro": {"exact_match,custom-extract": 0.09981715425531915, "exact_match_stderr,custom-extract": 0.0027238072854190707, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.11576011157601115, "exact_match_stderr,custom-extract": 0.011956608475421748}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10773130544993663, "exact_match_stderr,custom-extract": 0.011044744671924529}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.054770318021201414, "exact_match_stderr,custom-extract": 0.0067656574329187915}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.12682926829268293, "exact_match_stderr,custom-extract": 0.016454991959400204}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1362559241706161, 
"exact_match_stderr,custom-extract": 0.011815618235249946}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.0846233230134159, "exact_match_stderr,custom-extract": 0.008945554797547947}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.10513447432762836, "exact_match_stderr,custom-extract": 0.010731005913982418}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10761154855643044, "exact_match_stderr,custom-extract": 0.01589697945272337}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08719346049046321, "exact_match_stderr,custom-extract": 0.00850618817194349}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0695780903034789, "exact_match_stderr,custom-extract": 0.006924833446490232}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11363636363636363, "exact_match_stderr,custom-extract": 0.010446330904020992}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11823647294589178, "exact_match_stderr,custom-extract": 0.014468953704661749}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09160892994611239, "exact_match_stderr,custom-extract": 0.008006966049388355}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.15288220551378445, "exact_match_stderr,custom-extract": 0.01274738818679038}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.218, "acc_stderr,none": 0.01848337822317884, "acc_norm,none": 0.338, "acc_norm_stderr,none": 0.02117566569520941}, "piqa": {"alias": "piqa", "acc,none": 0.6773667029379761, "acc_stderr,none": 0.010907166359856606, "acc_norm,none": 0.6849836779107725, "acc_norm_stderr,none": 0.010838072746240652}, "race": {"alias": "race", "acc,none": 0.35119617224880384, "acc_stderr,none": 0.014773430019036976}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.417093142272262, "acc_stderr,none": 0.011157450926787521}, "winogrande": {"alias": "winogrande", "acc,none": 0.6037884767166535, "acc_stderr,none": 0.013746404157154968}} {"created_at": "2025-04-24T04:10:51.597273", "global_step": 254000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3122866894197952, "acc_stderr,none": 0.013542598541688065, "acc_norm,none": 0.36006825938566556, "acc_norm_stderr,none": 0.014027516814585186}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6313131313131313, "acc_stderr,none": 0.009899640855681045, "acc_norm,none": 0.6115319865319865, "acc_norm_stderr,none": 0.010001276044485226}, "boolq": {"alias": "boolq", "acc,none": 0.43119266055045874, "acc_stderr,none": 0.008661853128165593}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21621621621621623, "acc_stderr,none": 0.01178588917548664}, "copa": {"alias": "copa", "acc,none": 0.74, "acc_stderr,none": 0.04408440022768081}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.40290778729336785, "acc_stderr,none": 0.004894801119898614, "acc_norm,none": 0.5299741087432782, "acc_norm_stderr,none": 0.004980807231136746}, "mmlu": {"acc,none": 0.2674832644922376, "acc_stderr,none": 0.0037345907053208545, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25313496280552605, "acc_stderr,none": 0.006334867997514983, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.038932596106046734}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.22424242424242424, 
"acc_stderr,none": 0.03256866661681102}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3284313725490196, "acc_stderr,none": 0.032962451101722294}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.1940928270042194, "acc_stderr,none": 0.02574490253229093}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2809917355371901, "acc_stderr,none": 0.04103203830514511}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.19444444444444445, "acc_stderr,none": 0.038260763248848646}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615769}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2630057803468208, "acc_stderr,none": 0.023703099525258155}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2558659217877095, "acc_stderr,none": 0.014593620923210756}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2958199356913183, "acc_stderr,none": 0.025922371788818777}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.024383665531035447}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24315514993481094, "acc_stderr,none": 0.01095655665441736}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.034462962170884265}, "mmlu_other": {"acc,none": 0.26295461860315417, "acc_stderr,none": 0.007903531987063445, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.25660377358490566, "acc_stderr,none": 0.026880647889051982}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2658959537572254, "acc_stderr,none": 0.033687629322594316}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2645739910313901, "acc_stderr,none": 0.029605103217038332}, "mmlu_management": {"alias": " - management", "acc,none": 0.30097087378640774, "acc_stderr,none": 0.04541609446503948}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.029480360549541194}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165065}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.26947637292464877, "acc_stderr,none": 0.015866243073215058}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24183006535947713, "acc_stderr,none": 0.024518195641879337}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2801418439716312, "acc_stderr,none": 0.026789172351140252}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.23161764705882354, "acc_stderr,none": 0.025626533803777562}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2289156626506024, "acc_stderr,none": 0.03270745277352477}, "mmlu_social_sciences": {"acc,none": 0.287292817679558, "acc_stderr,none": 0.008159003005827732, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 
0.29292929292929293, "acc_stderr,none": 0.03242497958178815}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.31088082901554404, "acc_stderr,none": 0.03340361906276586}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2923076923076923, "acc_stderr,none": 0.02306043838085774}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3067226890756303, "acc_stderr,none": 0.029953823891887044}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.27706422018348625, "acc_stderr,none": 0.019188482590169538}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.018120224251484594}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.33636363636363636, "acc_stderr,none": 0.04525393596302506}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3346938775510204, "acc_stderr,none": 0.030209235226242307}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2935323383084577, "acc_stderr,none": 0.032200241045342054}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_stem": {"acc,none": 0.27402473834443386, "acc_stderr,none": 0.007950972562547056, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.03712537833614866}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.34868421052631576, "acc_stderr,none": 0.038781398887976104}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.03745554791462457}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.04336432707993179}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.02924188386962882}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.25517241379310346, "acc_stderr,none": 0.03632984052707842}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.291005291005291, "acc_stderr,none": 0.02339382650048487}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2838709677419355, "acc_stderr,none": 0.025649381063029258}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2660098522167488, "acc_stderr,none": 0.03108982600293752}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_high_school_mathematics": {"alias": 
" - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.271523178807947, "acc_stderr,none": 0.036313298039696545}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.02896370257079102}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755803}, "mmlu_pro": {"exact_match,custom-extract": 0.11810172872340426, "exact_match_stderr,custom-extract": 0.0029286998558815964, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.17573221757322174, "exact_match_stderr,custom-extract": 0.014223397460075789}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09759188846641319, "exact_match_stderr,custom-extract": 0.010571710152486615}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.08392226148409894, "exact_match_stderr,custom-extract": 0.008244673307123567}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.11219512195121951, "exact_match_stderr,custom-extract": 0.0156057302936758}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.17417061611374407, "exact_match_stderr,custom-extract": 0.013062274992927891}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09391124871001032, "exact_match_stderr,custom-extract": 0.009375760359013234}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.14180929095354522, "exact_match_stderr,custom-extract": 0.012204871709898287}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.1679790026246719, "exact_match_stderr,custom-extract": 0.019177979237567238}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09536784741144415, "exact_match_stderr,custom-extract": 0.008856062181125262}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07994078460399703, "exact_match_stderr,custom-extract": 0.0073811700146960025}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1266233766233766, "exact_match_stderr,custom-extract": 0.010946036109831471}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1402805611222445, "exact_match_stderr,custom-extract": 0.015561893867712498}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10392609699769054, "exact_match_stderr,custom-extract": 0.00847026264505465}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.15538847117794485, "exact_match_stderr,custom-extract": 0.012832425121856766}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.238, "acc_stderr,none": 0.019064072958198456, "acc_norm,none": 0.338, "acc_norm_stderr,none": 0.02117566569520941}, "piqa": {"alias": "piqa", "acc,none": 0.6713819368879217, "acc_stderr,none": 0.010959127105167046, "acc_norm,none": 0.6724700761697497, "acc_norm_stderr,none": 0.010949830482825482}, "race": {"alias": "race", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.01488799043759141}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4278403275332651, "acc_stderr,none": 0.011195625418198208}, "winogrande": {"alias": "winogrande", "acc,none": 0.595895816890292, "acc_stderr,none": 0.013791610664670849}} {"created_at": 
"2025-04-24T06:02:10.313835", "global_step": 256000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.30631399317406144, "acc_stderr,none": 0.013470584417276513, "acc_norm,none": 0.3395904436860068, "acc_norm_stderr,none": 0.013839039762820167}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6367845117845118, "acc_stderr,none": 0.009868397136118806, "acc_norm,none": 0.5972222222222222, "acc_norm_stderr,none": 0.010063960494989165}, "boolq": {"alias": "boolq", "acc,none": 0.6311926605504588, "acc_stderr,none": 0.008438656079759065}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20475020475020475, "acc_stderr,none": 0.011552714477876678}, "copa": {"alias": "copa", "acc,none": 0.76, "acc_stderr,none": 0.04292346959909284}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4053973312089225, "acc_stderr,none": 0.004899653704032837, "acc_norm,none": 0.5314678350926111, "acc_norm_stderr,none": 0.004979889597551664}, "mmlu": {"acc,none": 0.24177467597208374, "acc_stderr,none": 0.0036092684549900475, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2499468650371945, "acc_stderr,none": 0.006303417485573126, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30158730158730157, "acc_stderr,none": 0.04104947269903394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.17575757575757575, "acc_stderr,none": 0.02972094300622445}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.29901960784313725, "acc_stderr,none": 0.03213325717373616}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2616033755274262, "acc_stderr,none": 0.028609516716994934}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2975206611570248, "acc_stderr,none": 0.04173349148083497}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094634}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.03226219377286774}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.02289408248992599}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23910614525139665, "acc_stderr,none": 0.014265554192331152}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.17684887459807075, "acc_stderr,none": 0.02167005888551078}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.023468429832451152}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2692307692307692, "acc_stderr,none": 0.011328734403140315}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.034462962170884265}, "mmlu_other": {"acc,none": 0.24332153202446088, "acc_stderr,none": 0.007688350789657515, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2188679245283019, "acc_stderr,none": 0.025447863825108625}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173043}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.16, "acc_stderr,none": 0.03684529491774709}, "mmlu_human_aging": {"alias": " - human_aging", 
"acc,none": 0.3094170403587444, "acc_stderr,none": 0.03102441174057219}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.03916667762822584}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24010217113665389, "acc_stderr,none": 0.015274685213734195}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.02428861946604611}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2375886524822695, "acc_stderr,none": 0.025389512552729903}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.21323529411764705, "acc_stderr,none": 0.024880971512294268}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.27710843373493976, "acc_stderr,none": 0.034843315926805875}, "mmlu_social_sciences": {"acc,none": 0.2346441338966526, "acc_stderr,none": 0.0076337902889065, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.18421052631578946, "acc_stderr,none": 0.03646758875075566}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.17676767676767677, "acc_stderr,none": 0.027178752639044915}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.25906735751295334, "acc_stderr,none": 0.03161877917935408}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.21025641025641026, "acc_stderr,none": 0.02066059748502692}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.226890756302521, "acc_stderr,none": 0.02720537153827948}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.20733944954128442, "acc_stderr,none": 0.01738141556360866}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.26717557251908397, "acc_stderr,none": 0.03880848301082395}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.017740899509177795}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072775}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.27346938775510204, "acc_stderr,none": 0.028535560337128438}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.263681592039801, "acc_stderr,none": 0.03115715086935556}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_stem": {"acc,none": 0.23501427212178877, "acc_stderr,none": 0.00754957045729362, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.16296296296296298, "acc_stderr,none": 0.031905414744828386}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.03317672787533157}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.03852084696008534}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.24, "acc_stderr,none": 
0.042923469599092816}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816508}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.24680851063829787, "acc_stderr,none": 0.028185441301234095}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.22486772486772486, "acc_stderr,none": 0.021502096078229147}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.20967741935483872, "acc_stderr,none": 0.02315787934908352}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.19704433497536947, "acc_stderr,none": 0.02798672466673622}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.02620276653465215}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23178807947019867, "acc_stderr,none": 0.03445406271987054}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.22685185185185186, "acc_stderr,none": 0.028561650102422252}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.11145279255319149, "exact_match_stderr,custom-extract": 0.002860698233643613, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.1394700139470014, "exact_match_stderr,custom-extract": 0.012946933436697635}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11913814955640051, "exact_match_stderr,custom-extract": 0.011540276571470743}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06890459363957598, "exact_match_stderr,custom-extract": 0.007531645622174554}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.13414634146341464, "exact_match_stderr,custom-extract": 0.01685194412727912}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1552132701421801, "exact_match_stderr,custom-extract": 0.012471657591396939}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10526315789473684, "exact_match_stderr,custom-extract": 0.009863889056501643}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12469437652811736, "exact_match_stderr,custom-extract": 0.011558254824072241}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10761154855643044, "exact_match_stderr,custom-extract": 0.01589697945272338}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07538601271571299, "exact_match_stderr,custom-extract": 0.007960297036631309}, "mmlu_pro_math": {"alias": " - math", 
"exact_match,custom-extract": 0.09104367135455219, "exact_match_stderr,custom-extract": 0.007829418466689907}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1406926406926407, "exact_match_stderr,custom-extract": 0.011444823662931425}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12424849699398798, "exact_match_stderr,custom-extract": 0.014781596611020431}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.100846805234796, "exact_match_stderr,custom-extract": 0.008358157908754939}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.13659147869674185, "exact_match_stderr,custom-extract": 0.012164403230432023}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.248, "acc_stderr,none": 0.019332342821239103, "acc_norm,none": 0.342, "acc_norm_stderr,none": 0.021236147199899254}, "piqa": {"alias": "piqa", "acc,none": 0.6708378672470077, "acc_stderr,none": 0.0109637504141347, "acc_norm,none": 0.6784548422198041, "acc_norm_stderr,none": 0.010897500107575654}, "race": {"alias": "race", "acc,none": 0.35119617224880384, "acc_stderr,none": 0.014773430019036977}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43193449334698053, "acc_stderr,none": 0.011208746100567539}, "winogrande": {"alias": "winogrande", "acc,none": 0.5998421468034728, "acc_stderr,none": 0.01376947266046499}} {"created_at": "2025-04-24T07:36:00.613159", "global_step": 258000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.31143344709897613, "acc_stderr,none": 0.013532472099850945, "acc_norm,none": 0.35921501706484643, "acc_norm_stderr,none": 0.01402022415583916}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6468855218855218, "acc_stderr,none": 0.009807078935467612, "acc_norm,none": 0.6161616161616161, "acc_norm_stderr,none": 0.009979061846649307}, "boolq": {"alias": "boolq", "acc,none": 0.5706422018348624, "acc_stderr,none": 0.008657333755353673}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19000819000819, "acc_stderr,none": 0.011231727519127847}, "copa": {"alias": "copa", "acc,none": 0.72, "acc_stderr,none": 0.045126085985421276}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4075881298546106, "acc_stderr,none": 0.0049038158859832795, "acc_norm,none": 0.5343557060346544, "acc_norm_stderr,none": 0.004977988452502641}, "mmlu": {"acc,none": 0.2588662583677539, "acc_stderr,none": 0.003691028223585181, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27013815090329435, "acc_stderr,none": 0.006466721913349616, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.18253968253968253, "acc_stderr,none": 0.034550710191021475}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2787878787878788, "acc_stderr,none": 0.03501438706296781}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.030190282453501964}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.24050632911392406, "acc_stderr,none": 0.027820781981149675}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.39669421487603307, "acc_stderr,none": 0.04465869780531009}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3148148148148148, "acc_stderr,none": 0.04489931073591312}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3067484662576687, "acc_stderr,none": 
0.03623089915724147}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.29190751445086704, "acc_stderr,none": 0.02447699407624733}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2604501607717042, "acc_stderr,none": 0.024926723224845543}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2716049382716049, "acc_stderr,none": 0.02474862449053737}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27835723598435463, "acc_stderr,none": 0.011446990197380985}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.033773102522091945}, "mmlu_other": {"acc,none": 0.25329900225297713, "acc_stderr,none": 0.007779095003857061, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816508}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.25660377358490566, "acc_stderr,none": 0.026880647889051985}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.033450369167889904}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.23318385650224216, "acc_stderr,none": 0.028380391147094723}, "mmlu_management": {"alias": " - management", "acc,none": 0.18446601941747573, "acc_stderr,none": 0.03840423627288276}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.29914529914529914, "acc_stderr,none": 0.02999695185834949}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.28607918263090676, "acc_stderr,none": 0.01616087140512754}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24836601307189543, "acc_stderr,none": 0.024739981355113592}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2624113475177305, "acc_stderr,none": 0.02624492034984301}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.16176470588235295, "acc_stderr,none": 0.022368672562886757}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.20481927710843373, "acc_stderr,none": 0.03141784291663925}, "mmlu_social_sciences": {"acc,none": 0.24699382515437113, "acc_stderr,none": 0.0077731777289372275, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.040969851398436716}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.02962022787479047}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22279792746113988, "acc_stderr,none": 0.03003114797764154}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.21794871794871795, "acc_stderr,none": 0.020932445774463196}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.226890756302521, "acc_stderr,none": 0.027205371538279472}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23853211009174313, "acc_stderr,none": 0.01827257581023186}, "mmlu_human_sexuality": 
{"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306085}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2973856209150327, "acc_stderr,none": 0.018492596536396955}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.040139645540727735}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.23673469387755103, "acc_stderr,none": 0.02721283588407315}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.030360490154014645}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.2591183000317158, "acc_stderr,none": 0.007796391825341833, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.32592592592592595, "acc_stderr,none": 0.040491220417025055}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.03690677986137283}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.03586879280080341}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.20851063829787234, "acc_stderr,none": 0.026556982117838742}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.25517241379310346, "acc_stderr,none": 0.03632984052707842}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.02286083830923207}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24516129032258063, "acc_stderr,none": 0.024472243840895528}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.270935960591133, "acc_stderr,none": 0.03127090713297697}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.02659393910184406}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2582781456953642, "acc_stderr,none": 0.035737053147634576}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2175925925925926, "acc_stderr,none": 0.02813968944485967}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755806}, "mmlu_pro": {"exact_match,custom-extract": 0.10488696808510638, "exact_match_stderr,custom-extract": 0.0027837567062239748, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - 
biology", "exact_match,custom-extract": 0.12133891213389121, "exact_match_stderr,custom-extract": 0.012202652228883252}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10139416983523447, "exact_match_stderr,custom-extract": 0.010752959229023348}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07332155477031801, "exact_match_stderr,custom-extract": 0.007750845159494198}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.13170731707317074, "exact_match_stderr,custom-extract": 0.016721543700347657}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.14336492890995262, "exact_match_stderr,custom-extract": 0.012069953580722765}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09081527347781218, "exact_match_stderr,custom-extract": 0.009235657832562172}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.15158924205378974, "exact_match_stderr,custom-extract": 0.012546605589092928}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12860892388451445, "exact_match_stderr,custom-extract": 0.01717316362524469}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.06993642143505904, "exact_match_stderr,custom-extract": 0.007689740413630028}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07327905255366396, "exact_match_stderr,custom-extract": 0.007092470342788471}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1266233766233766, "exact_match_stderr,custom-extract": 0.010946036109831513}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1342685370741483, "exact_match_stderr,custom-extract": 0.015277913884522439}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09160892994611239, "exact_match_stderr,custom-extract": 0.008006966049388359}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12155388471177944, "exact_match_stderr,custom-extract": 0.011574782101911055}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.24, "acc_stderr,none": 0.019118866653759736, "acc_norm,none": 0.346, "acc_norm_stderr,none": 0.021294951277234637}, "piqa": {"alias": "piqa", "acc,none": 0.6724700761697497, "acc_stderr,none": 0.010949830482825476, "acc_norm,none": 0.6817192600652884, "acc_norm_stderr,none": 0.010868093932082233}, "race": {"alias": "race", "acc,none": 0.36650717703349284, "acc_stderr,none": 0.014912890943719231}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.417093142272262, "acc_stderr,none": 0.011157450926787521}, "winogrande": {"alias": "winogrande", "acc,none": 0.6037884767166535, "acc_stderr,none": 0.01374640415715496}} {"created_at": "2025-04-24T09:41:56.306253", "global_step": 260000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.30204778156996587, "acc_stderr,none": 0.013417519144716419, "acc_norm,none": 0.36006825938566556, "acc_norm_stderr,none": 0.014027516814585186}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6485690235690236, "acc_stderr,none": 0.009796395582817722, "acc_norm,none": 0.6216329966329966, "acc_norm_stderr,none": 0.009951575683331947}, "boolq": {"alias": "boolq", "acc,none": 0.518348623853211, "acc_stderr,none": 0.008739164562341828}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.23013923013923013, "acc_stderr,none": 0.012050956185794135}, "copa": {"alias": "copa", 
"acc,none": 0.72, "acc_stderr,none": 0.045126085985421276}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4078868751244772, "acc_stderr,none": 0.004904375631128866, "acc_norm,none": 0.5320653256323441, "acc_norm_stderr,none": 0.004979510001776617}, "mmlu": {"acc,none": 0.2385700042728956, "acc_stderr,none": 0.003590469629690649, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2456960680127524, "acc_stderr,none": 0.006270814761204574, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.03970158273235173}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.033175059300091805}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.031321798030832904}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3206751054852321, "acc_stderr,none": 0.03038193194999041}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.039849796533028704}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.04414343666854932}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.20245398773006135, "acc_stderr,none": 0.03157065078911903}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.02344582627654555}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.18006430868167203, "acc_stderr,none": 0.02182342285774494}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.22530864197530864, "acc_stderr,none": 0.02324620264781975}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24641460234680573, "acc_stderr,none": 0.011005971399927235}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.27485380116959063, "acc_stderr,none": 0.03424042924691584}, "mmlu_other": {"acc,none": 0.2574831026713872, "acc_stderr,none": 0.007830915497259562, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22264150943396227, "acc_stderr,none": 0.025604233470899098}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.03095289021774988}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720685}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3183856502242152, "acc_stderr,none": 0.03126580522513713}, "mmlu_management": {"alias": " - management", "acc,none": 0.20388349514563106, "acc_stderr,none": 0.039891398595317706}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.02987257770889117}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2707535121328225, "acc_stderr,none": 0.015889888362560486}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.024288619466046102}, 
"mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.26595744680851063, "acc_stderr,none": 0.026358065698880592}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.18382352941176472, "acc_stderr,none": 0.023529242185193106}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.27710843373493976, "acc_stderr,none": 0.034843315926805875}, "mmlu_social_sciences": {"acc,none": 0.22326941826454338, "acc_stderr,none": 0.0075023143516892874, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.04142439719489362}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.19696969696969696, "acc_stderr,none": 0.02833560973246335}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2153846153846154, "acc_stderr,none": 0.020843034557462878}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.026653531596715484}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1926605504587156, "acc_stderr,none": 0.016909276884936087}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306086}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2581699346405229, "acc_stderr,none": 0.017704531653250075}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.04069306319721375}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.1836734693877551, "acc_stderr,none": 0.024789071332007657}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.030360490154014652}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_stem": {"acc,none": 0.22423089121471615, "acc_stderr,none": 0.007414109208212061, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.034554737023254366}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19736842105263158, "acc_stderr,none": 0.03238981601699397}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237654}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342347}, 
"mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21957671957671956, "acc_stderr,none": 0.021320018599770348}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2064516129032258, "acc_stderr,none": 0.023025899617188723}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.17733990147783252, "acc_stderr,none": 0.026874337276808352}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.1962962962962963, "acc_stderr,none": 0.024217421327417166}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2582781456953642, "acc_stderr,none": 0.035737053147634576}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1527777777777778, "acc_stderr,none": 0.02453632602613423}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.04432804055291518}, "mmlu_pro": {"exact_match,custom-extract": 0.10987367021276596, "exact_match_stderr,custom-extract": 0.002837155691766369, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.1492329149232915, "exact_match_stderr,custom-extract": 0.013316225455158426}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10519645120405577, "exact_match_stderr,custom-extract": 0.010929524923270326}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.04151943462897526, "exact_match_stderr,custom-extract": 0.0059317941141009385}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.11463414634146342, "exact_match_stderr,custom-extract": 0.015752762697429746}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.14691943127962084, "exact_match_stderr,custom-extract": 0.012193288704573773}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10526315789473684, "exact_match_stderr,custom-extract": 0.00986388905650163}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.15158924205378974, "exact_match_stderr,custom-extract": 0.012546605589092916}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.13123359580052493, "exact_match_stderr,custom-extract": 0.01732136945584154}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09082652134423251, "exact_match_stderr,custom-extract": 0.008664297923859648}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07327905255366396, "exact_match_stderr,custom-extract": 0.0070924703427884695}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1406926406926407, "exact_match_stderr,custom-extract": 0.011444823662931414}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11823647294589178, "exact_match_stderr,custom-extract": 0.014468953704661771}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1100846805234796, "exact_match_stderr,custom-extract": 0.00868761243899811}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.13408521303258145, 
"exact_match_stderr,custom-extract": 0.012069766280503623}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.242, "acc_stderr,none": 0.019173085678337167, "acc_norm,none": 0.368, "acc_norm_stderr,none": 0.021588982568353544}, "piqa": {"alias": "piqa", "acc,none": 0.6686615886833515, "acc_stderr,none": 0.010982077458957348, "acc_norm,none": 0.675734494015234, "acc_norm_stderr,none": 0.01092153904134797}, "race": {"alias": "race", "acc,none": 0.3464114832535885, "acc_stderr,none": 0.014726451021782803}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.41146366427840325, "acc_stderr,none": 0.011135283116540202}, "winogrande": {"alias": "winogrande", "acc,none": 0.5816890292028414, "acc_stderr,none": 0.013863669961195904}} {"created_at": "2025-04-24T10:55:29.147737", "global_step": 262000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.32081911262798635, "acc_stderr,none": 0.013640943091946528, "acc_norm,none": 0.3660409556313993, "acc_norm_stderr,none": 0.01407722310847014}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6460437710437711, "acc_stderr,none": 0.009812370644174421, "acc_norm,none": 0.6355218855218855, "acc_norm_stderr,none": 0.009875729282482438}, "boolq": {"alias": "boolq", "acc,none": 0.4685015290519878, "acc_stderr,none": 0.00872768484861531}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21048321048321048, "acc_stderr,none": 0.011671038436522908}, "copa": {"alias": "copa", "acc,none": 0.67, "acc_stderr,none": 0.047258156262526066}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4080860386377216, "acc_stderr,none": 0.0049047477522869585, "acc_norm,none": 0.5318661621190998, "acc_norm_stderr,none": 0.004979637330230315}, "mmlu": {"acc,none": 0.2571571001281869, "acc_stderr,none": 0.0036823661105232857, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2688629117959617, "acc_stderr,none": 0.0064561238925623125, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.18253968253968253, "acc_stderr,none": 0.03455071019102147}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.0347769116216366}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28921568627450983, "acc_stderr,none": 0.03182231867647553}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3080168776371308, "acc_stderr,none": 0.030052389335605702}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.371900826446281, "acc_stderr,none": 0.04412015806624504}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.04077494709252628}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2822085889570552, "acc_stderr,none": 0.03536117886664743}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2658959537572254, "acc_stderr,none": 0.023786203255508283}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2435754189944134, "acc_stderr,none": 0.014355911964767864}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.21543408360128619, "acc_stderr,none": 0.02335022547547142}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.02517104191530968}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.28292046936114734, "acc_stderr,none": 0.011503891323188978}, "mmlu_world_religions": {"alias": " - 
world_religions", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.033773102522091945}, "mmlu_other": {"acc,none": 0.271000965561635, "acc_stderr,none": 0.007963540088462907, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2943396226415094, "acc_stderr,none": 0.028049186315695245}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.31213872832369943, "acc_stderr,none": 0.035331333893236574}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3004484304932735, "acc_stderr,none": 0.030769352008229136}, "mmlu_management": {"alias": " - management", "acc,none": 0.27184466019417475, "acc_stderr,none": 0.044052680241409216}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2606837606837607, "acc_stderr,none": 0.028760348956523414}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.21, "acc_stderr,none": 0.04093601807403325}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.28735632183908044, "acc_stderr,none": 0.0161824107306827}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2581699346405229, "acc_stderr,none": 0.025058503316958143}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.3049645390070922, "acc_stderr,none": 0.027464708442022135}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1948529411764706, "acc_stderr,none": 0.024060599423487424}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21686746987951808, "acc_stderr,none": 0.03208284450356365}, "mmlu_social_sciences": {"acc,none": 0.24699382515437113, "acc_stderr,none": 0.007773216658964071, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.039994238792813365}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.030532892233932053}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.25906735751295334, "acc_stderr,none": 0.03161877917935409}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2205128205128205, "acc_stderr,none": 0.0210206726808279}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.20168067226890757, "acc_stderr,none": 0.026064313406304534}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22935779816513763, "acc_stderr,none": 0.018025349724618684}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2875816993464052, "acc_stderr,none": 0.018311653053648222}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.04350271442923243}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.23265306122448978, "acc_stderr,none": 0.02704925791589618}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.26865671641791045, "acc_stderr,none": 0.03134328358208954}, "mmlu_us_foreign_policy": {"alias": " - 
us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_stem": {"acc,none": 0.23596574690770694, "acc_stderr,none": 0.007549507267129097, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.28888888888888886, "acc_stderr,none": 0.0391545063041425}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.20394736842105263, "acc_stderr,none": 0.0327900040631005}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.03551446610810826}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.19607843137254902, "acc_stderr,none": 0.039505818611799616}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.029241883869628785}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.31724137931034485, "acc_stderr,none": 0.03878352372138621}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25132275132275134, "acc_stderr,none": 0.022340482339643898}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.20967741935483872, "acc_stderr,none": 0.02315787934908352}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.21182266009852216, "acc_stderr,none": 0.028748983689941075}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.024388430433987664}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.25165562913907286, "acc_stderr,none": 0.035433042343899844}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16203703703703703, "acc_stderr,none": 0.02513045365226846}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.26785714285714285, "acc_stderr,none": 0.04203277291467762}, "mmlu_pro": {"exact_match,custom-extract": 0.10787898936170212, "exact_match_stderr,custom-extract": 0.002821078542880776, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.11854951185495119, "exact_match_stderr,custom-extract": 0.012080706552248572}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10773130544993663, "exact_match_stderr,custom-extract": 0.01104474467192454}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.054770318021201414, "exact_match_stderr,custom-extract": 0.006765657432918792}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1073170731707317, "exact_match_stderr,custom-extract": 0.015304578391699535}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 
0.14928909952606634, "exact_match_stderr,custom-extract": 0.012274145317879102}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10113519091847266, "exact_match_stderr,custom-extract": 0.009690822961217331}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1234718826405868, "exact_match_stderr,custom-extract": 0.01150948610074644}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.14173228346456693, "exact_match_stderr,custom-extract": 0.0178917978332682}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11444141689373297, "exact_match_stderr,custom-extract": 0.009598512147633248}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0925240562546262, "exact_match_stderr,custom-extract": 0.007886385609194017}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12121212121212122, "exact_match_stderr,custom-extract": 0.010742718710399555}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12424849699398798, "exact_match_stderr,custom-extract": 0.014781596611020452}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08622016936104696, "exact_match_stderr,custom-extract": 0.007790904368268171}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.13283208020050125, "exact_match_stderr,custom-extract": 0.012021922607242962}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.224, "acc_stderr,none": 0.018663994464710794, "acc_norm,none": 0.354, "acc_norm_stderr,none": 0.021407582047916447}, "piqa": {"alias": "piqa", "acc,none": 0.6724700761697497, "acc_stderr,none": 0.010949830482825476, "acc_norm,none": 0.6784548422198041, "acc_norm_stderr,none": 0.01089750010757565}, "race": {"alias": "race", "acc,none": 0.3464114832535885, "acc_stderr,none": 0.014726451021782805}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4273285568065507, "acc_stderr,none": 0.01119393034055127}, "winogrande": {"alias": "winogrande", "acc,none": 0.5919494869771112, "acc_stderr,none": 0.013812822643745027}} {"created_at": "2025-04-24T13:22:41.993134", "global_step": 264000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3165529010238908, "acc_stderr,none": 0.01359243151906808, "acc_norm,none": 0.3660409556313993, "acc_norm_stderr,none": 0.014077223108470139}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6468855218855218, "acc_stderr,none": 0.009807078935467617, "acc_norm,none": 0.6258417508417509, "acc_norm_stderr,none": 0.009929516948977625}, "boolq": {"alias": "boolq", "acc,none": 0.5593272171253822, "acc_stderr,none": 0.008683276495829013}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19656019656019655, "acc_stderr,none": 0.01137743977396399}, "copa": {"alias": "copa", "acc,none": 0.75, "acc_stderr,none": 0.04351941398892446}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4095797649870544, "acc_stderr,none": 0.004907512103128346, "acc_norm,none": 0.5371439952200757, "acc_norm_stderr,none": 0.004975993795562032}, "mmlu": {"acc,none": 0.23500925794046432, "acc_stderr,none": 0.0035728869996413904, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24760892667375134, "acc_stderr,none": 0.006288627592662084, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30158730158730157, "acc_stderr,none": 0.04104947269903394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 
0.21212121212121213, "acc_stderr,none": 0.031922715695482995}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.02904133351059804}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.24793388429752067, "acc_stderr,none": 0.03941897526516302}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615767}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2514450867052023, "acc_stderr,none": 0.02335736578587403}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23910614525139665, "acc_stderr,none": 0.014265554192331154}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480768}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023132376234543332}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2561929595827901, "acc_stderr,none": 0.011149173153110582}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.32748538011695905, "acc_stderr,none": 0.035993357714560276}, "mmlu_other": {"acc,none": 0.2442870936594786, "acc_stderr,none": 0.007693507456433054, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22641509433962265, "acc_stderr,none": 0.025757559893106734}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.03095289021774988}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3183856502242152, "acc_stderr,none": 0.03126580522513713}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.029480360549541194}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23627075351213284, "acc_stderr,none": 0.0151904737170375}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.238562091503268, "acc_stderr,none": 0.024404394928087866}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2375886524822695, "acc_stderr,none": 0.025389512552729903}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1875, "acc_stderr,none": 0.023709788253811766}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.29518072289156627, "acc_stderr,none": 0.035509201856896294}, "mmlu_social_sciences": {"acc,none": 0.22001949951251218, "acc_stderr,none": 0.007462970906779037, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 
0.18181818181818182, "acc_stderr,none": 0.027479603010538804}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19170984455958548, "acc_stderr,none": 0.028408953626245282}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2076923076923077, "acc_stderr,none": 0.020567539567246794}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1908256880733945, "acc_stderr,none": 0.016847676400091105}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.03768335959728743}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.017555818091322273}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072775}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.21224489795918366, "acc_stderr,none": 0.026176967197866767}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.03014777593540922}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_stem": {"acc,none": 0.22169362511893434, "acc_stderr,none": 0.007389407289158748, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.035478541985608236}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2236842105263158, "acc_stderr,none": 0.033911609343436025}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566016}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.19607843137254902, "acc_stderr,none": 0.03950581861179963}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2297872340425532, "acc_stderr,none": 0.02750175294441242}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21693121693121692, "acc_stderr,none": 0.02122708244944506}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.19032258064516128, "acc_stderr,none": 0.022331707611823078}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.15270935960591134, "acc_stderr,none": 0.025308904539380627}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, 
"mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2074074074074074, "acc_stderr,none": 0.024720713193952134}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23178807947019867, "acc_stderr,none": 0.03445406271987054}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1712962962962963, "acc_stderr,none": 0.0256953416438247}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2767857142857143, "acc_stderr,none": 0.042466243366976256}, "mmlu_pro": {"exact_match,custom-extract": 0.11153590425531915, "exact_match_stderr,custom-extract": 0.0028621762270362693, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.1492329149232915, "exact_match_stderr,custom-extract": 0.01331622545515844}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11406844106463879, "exact_match_stderr,custom-extract": 0.011324518110214694}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07508833922261485, "exact_match_stderr,custom-extract": 0.007836192107843051}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.10975609756097561, "exact_match_stderr,custom-extract": 0.015456358358757432}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1338862559241706, "exact_match_stderr,custom-extract": 0.011728478505969217}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10113519091847266, "exact_match_stderr,custom-extract": 0.009690822961217345}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1356968215158924, "exact_match_stderr,custom-extract": 0.011981380605227169}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.13385826771653545, "exact_match_stderr,custom-extract": 0.017467280079326603}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10535876475930972, "exact_match_stderr,custom-extract": 0.00925685473030193}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07549962990377498, "exact_match_stderr,custom-extract": 0.007190499688409162}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12878787878787878, "exact_match_stderr,custom-extract": 0.01102550775727543}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12625250501002003, "exact_match_stderr,custom-extract": 0.014883268009546953}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09237875288683603, "exact_match_stderr,custom-extract": 0.008037130651407133}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.15288220551378445, "exact_match_stderr,custom-extract": 0.012747388186790382}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.24, "acc_stderr,none": 0.019118866653759732, "acc_norm,none": 0.36, "acc_norm_stderr,none": 0.02148775108972052}, "piqa": {"alias": "piqa", "acc,none": 0.6719260065288357, "acc_stderr,none": 0.010954487135124216, "acc_norm,none": 0.675734494015234, "acc_norm_stderr,none": 0.010921539041347962}, "race": {"alias": "race", "acc,none": 0.3569377990430622, "acc_stderr,none": 0.014827656367408909}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.41453428863868985, "acc_stderr,none": 0.01114756056703673}, "winogrande": {"alias": "winogrande", "acc,none": 0.6053670086819258, "acc_stderr,none": 
0.013736915172371886}} {"created_at": "2025-04-24T14:35:24.165690", "global_step": 266000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3216723549488055, "acc_stderr,none": 0.013650488084494164, "acc_norm,none": 0.363481228668942, "acc_norm_stderr,none": 0.014056207319068283}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6414141414141414, "acc_stderr,none": 0.009840882301225297, "acc_norm,none": 0.6203703703703703, "acc_norm_stderr,none": 0.00995803772546857}, "boolq": {"alias": "boolq", "acc,none": 0.6685015290519878, "acc_stderr,none": 0.008233500324571524}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2153972153972154, "acc_stderr,none": 0.011769690686226967}, "copa": {"alias": "copa", "acc,none": 0.72, "acc_stderr,none": 0.04512608598542127}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.40938060147381, "acc_stderr,none": 0.0049071462293475525, "acc_norm,none": 0.5407289384584744, "acc_norm_stderr,none": 0.004973199296339963}, "mmlu": {"acc,none": 0.2601481270474291, "acc_stderr,none": 0.003698137663320722, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.259086078639745, "acc_stderr,none": 0.0063713387998249145, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3888888888888889, "acc_stderr,none": 0.04360314860077459}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.296969696969697, "acc_stderr,none": 0.03567969772268048}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.031660096793998116}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25738396624472576, "acc_stderr,none": 0.0284588209914603}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.30578512396694213, "acc_stderr,none": 0.04205953933884123}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.044531975073749834}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.033519538795212696}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.20520231213872833, "acc_stderr,none": 0.021742519835276287}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.28156424581005585, "acc_stderr,none": 0.015042290171866125}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2315112540192926, "acc_stderr,none": 0.023956532766639133}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.19753086419753085, "acc_stderr,none": 0.022152889927898947}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2627118644067797, "acc_stderr,none": 0.011240545514995667}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.18128654970760233, "acc_stderr,none": 0.029547741687640027}, "mmlu_other": {"acc,none": 0.25716124879304797, "acc_stderr,none": 0.007833324242597892, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.30943396226415093, "acc_stderr,none": 0.028450154794118627}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.27167630057803466, "acc_stderr,none": 0.03391750322321658}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_human_aging": 
{"alias": " - human_aging", "acc,none": 0.20179372197309417, "acc_stderr,none": 0.026936111912802273}, "mmlu_management": {"alias": " - management", "acc,none": 0.32038834951456313, "acc_stderr,none": 0.0462028408228004}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.029872577708891172}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.23627075351213284, "acc_stderr,none": 0.0151904737170375}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.27124183006535946, "acc_stderr,none": 0.025457756696667864}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.22695035460992907, "acc_stderr,none": 0.024987106365642976}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.29044117647058826, "acc_stderr,none": 0.027576468622740522}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.24096385542168675, "acc_stderr,none": 0.033293941190735296}, "mmlu_social_sciences": {"acc,none": 0.2567435814104647, "acc_stderr,none": 0.00787954851039613, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.26262626262626265, "acc_stderr,none": 0.031353050095330855}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.27461139896373055, "acc_stderr,none": 0.03221024508041156}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.258974358974359, "acc_stderr,none": 0.02221110681006166}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.027553614467863814}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.28807339449541286, "acc_stderr,none": 0.019416445892636025}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.20610687022900764, "acc_stderr,none": 0.03547771004159463}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.017630827375148383}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.04350271442923243}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2571428571428571, "acc_stderr,none": 0.02797982353874455}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22388059701492538, "acc_stderr,none": 0.029475250236017183}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_stem": {"acc,none": 0.2679987313669521, "acc_stderr,none": 0.007883569579061538, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.18, "acc_stderr,none": 0.038612291966536955}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.03749850709174022}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.03583496176361064}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.03745554791462456}, "mmlu_college_chemistry": {"alias": " - college_chemistry", 
"acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.04655010411319616}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2425531914893617, "acc_stderr,none": 0.028020226271200217}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.022860838309232072}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2645161290322581, "acc_stderr,none": 0.02509189237885928}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2660098522167488, "acc_stderr,none": 0.03108982600293752}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23333333333333334, "acc_stderr,none": 0.025787874220959312}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.32450331125827814, "acc_stderr,none": 0.038227469376587525}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.32407407407407407, "acc_stderr,none": 0.03191923445686185}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.042878587513404544}, "mmlu_pro": {"exact_match,custom-extract": 0.11344747340425532, "exact_match_stderr,custom-extract": 0.00288420591140197, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.1492329149232915, "exact_match_stderr,custom-extract": 0.013316225455158443}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11406844106463879, "exact_match_stderr,custom-extract": 0.011324518110214694}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06713780918727916, "exact_match_stderr,custom-extract": 0.007441509249865652}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.12926829268292683, "exact_match_stderr,custom-extract": 0.01658924160093821}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.14336492890995262, "exact_match_stderr,custom-extract": 0.012069953580722761}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10319917440660474, "exact_match_stderr,custom-extract": 0.009777963967387048}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12591687041564792, "exact_match_stderr,custom-extract": 0.011606661034408546}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12860892388451445, "exact_match_stderr,custom-extract": 0.017173163625244667}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11171662125340599, "exact_match_stderr,custom-extract": 0.009498134639311172}, "mmlu_pro_math": 
{"alias": " - math", "exact_match,custom-extract": 0.08586232420429311, "exact_match_stderr,custom-extract": 0.00762500688458845}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12554112554112554, "exact_match_stderr,custom-extract": 0.010905908590641358}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.10821643286573146, "exact_match_stderr,custom-extract": 0.013920719044718375}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10238645111624327, "exact_match_stderr,custom-extract": 0.008414505495298189}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.15538847117794485, "exact_match_stderr,custom-extract": 0.012832425121856773}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.232, "acc_stderr,none": 0.018896193591952045, "acc_norm,none": 0.334, "acc_norm_stderr,none": 0.021113492347743727}, "piqa": {"alias": "piqa", "acc,none": 0.6860718171926007, "acc_stderr,none": 0.010827928134189646, "acc_norm,none": 0.6806311207834603, "acc_norm_stderr,none": 0.010877964076613747}, "race": {"alias": "race", "acc,none": 0.3473684210526316, "acc_stderr,none": 0.014735977850381395}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4278403275332651, "acc_stderr,none": 0.011195625418198208}, "winogrande": {"alias": "winogrande", "acc,none": 0.6069455406471981, "acc_stderr,none": 0.013727276249108437}} {"created_at": "2025-04-24T17:21:01.609206", "global_step": 268000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3250853242320819, "acc_stderr,none": 0.013688147309729117, "acc_norm,none": 0.3728668941979522, "acc_norm_stderr,none": 0.014131176760131165}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.656986531986532, "acc_stderr,none": 0.00974096566648922, "acc_norm,none": 0.6338383838383839, "acc_norm_stderr,none": 0.009885391390947715}, "boolq": {"alias": "boolq", "acc,none": 0.6581039755351682, "acc_stderr,none": 0.00829634535556385}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20147420147420148, "acc_stderr,none": 0.011483500195202905}, "copa": {"alias": "copa", "acc,none": 0.74, "acc_stderr,none": 0.0440844002276808}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.40987851025692096, "acc_stderr,none": 0.0049080593535038375, "acc_norm,none": 0.538338976299542, "acc_norm_stderr,none": 0.004975091055697191}, "mmlu": {"acc,none": 0.23600626691354507, "acc_stderr,none": 0.0035786529807229394, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2507970244420829, "acc_stderr,none": 0.006316950570582959, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.29365079365079366, "acc_stderr,none": 0.04073524322147125}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.0340150671524904}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25316455696202533, "acc_stderr,none": 0.0283046579430353}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.24793388429752067, "acc_stderr,none": 0.03941897526516302}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.043300437496507437}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 
0.03226219377286774}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.25722543352601157, "acc_stderr,none": 0.02353292543104428}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19292604501607716, "acc_stderr,none": 0.022411516780911363}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.02313237623454333}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2646675358539765, "acc_stderr,none": 0.011267332992845533}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.24106855487608625, "acc_stderr,none": 0.00766327941766872, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.04605661864718381}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2188679245283019, "acc_stderr,none": 0.02544786382510861}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2023121387283237, "acc_stderr,none": 0.03063114553919882}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.31390134529147984, "acc_stderr,none": 0.031146796482972465}, "mmlu_management": {"alias": " - management", "acc,none": 0.1650485436893204, "acc_stderr,none": 0.03675668832233188}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.02934311479809447}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2388250319284802, "acc_stderr,none": 0.015246803197398687}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.21895424836601307, "acc_stderr,none": 0.02367908986180772}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2375886524822695, "acc_stderr,none": 0.025389512552729906}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.21691176470588236, "acc_stderr,none": 0.025035845227711264}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2710843373493976, "acc_stderr,none": 0.03460579907553027}, "mmlu_social_sciences": {"acc,none": 0.22814429639259018, "acc_stderr,none": 0.007557800400983712, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748142}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.026552207828215293}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20725388601036268, "acc_stderr,none": 0.02925282329180363}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2076923076923077, "acc_stderr,none": 0.020567539567246794}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2184873949579832, "acc_stderr,none": 0.02684151432295894}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.21834862385321102, "acc_stderr,none": 0.017712600528722717}, "mmlu_human_sexuality": 
{"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.037683359597287434}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.26143790849673204, "acc_stderr,none": 0.017776947157528044}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072775}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19183673469387755, "acc_stderr,none": 0.025206963154225423}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.26865671641791045, "acc_stderr,none": 0.03134328358208955}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_stem": {"acc,none": 0.21661909292737075, "acc_stderr,none": 0.007329210495009929, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2074074074074074, "acc_stderr,none": 0.03502553170678318}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19078947368421054, "acc_stderr,none": 0.031975658210325}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566016}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.04093601807403325}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.251063829787234, "acc_stderr,none": 0.02834696377716246}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.03600105692727772}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.02113285918275444}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.16451612903225807, "acc_stderr,none": 0.021090847745939327}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1625615763546798, "acc_stderr,none": 0.025960300064605576}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.02504044387700069}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2119205298013245, "acc_stderr,none": 0.03336767086567977}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1712962962962963, "acc_stderr,none": 0.02569534164382468}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04287858751340456}, "mmlu_pro": {"exact_match,custom-extract": 0.1141123670212766, "exact_match_stderr,custom-extract": 0.0028918869130697576, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", 
"exact_match,custom-extract": 0.14225941422594143, "exact_match_stderr,custom-extract": 0.01305455213366822}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09759188846641319, "exact_match_stderr,custom-extract": 0.010571710152486613}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0803886925795053, "exact_match_stderr,custom-extract": 0.008084782328031882}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1073170731707317, "exact_match_stderr,custom-extract": 0.015304578391699516}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1504739336492891, "exact_match_stderr,custom-extract": 0.012314171688222535}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.1021671826625387, "exact_match_stderr,custom-extract": 0.009734547484109865}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.13080684596577016, "exact_match_stderr,custom-extract": 0.011796749495986743}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12860892388451445, "exact_match_stderr,custom-extract": 0.017173163625244688}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11171662125340599, "exact_match_stderr,custom-extract": 0.00949813463931116}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.08364174685418209, "exact_match_stderr,custom-extract": 0.007534896840571086}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12121212121212122, "exact_match_stderr,custom-extract": 0.010742718710399559}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1342685370741483, "exact_match_stderr,custom-extract": 0.015277913884522433}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1031562740569669, "exact_match_stderr,custom-extract": 0.008442457140721595}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.16040100250626566, "exact_match_stderr,custom-extract": 0.012999011684333877}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.244, "acc_stderr,none": 0.01922673489361458, "acc_norm,none": 0.334, "acc_norm_stderr,none": 0.021113492347743727}, "piqa": {"alias": "piqa", "acc,none": 0.6702937976060935, "acc_stderr,none": 0.010968357083095152, "acc_norm,none": 0.6817192600652884, "acc_norm_stderr,none": 0.010868093932082231}, "race": {"alias": "race", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.014887990437591413}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4165813715455476, "acc_stderr,none": 0.011155497599417946}, "winogrande": {"alias": "winogrande", "acc,none": 0.6037884767166535, "acc_stderr,none": 0.013746404157154956}} {"created_at": "2025-04-24T18:20:36.414307", "global_step": 270000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.34044368600682595, "acc_stderr,none": 0.01384746051889298, "acc_norm,none": 0.37627986348122866, "acc_norm_stderr,none": 0.014157022555407168}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6447811447811448, "acc_stderr,none": 0.009820245899287127, "acc_norm,none": 0.6308922558922558, "acc_norm_stderr,none": 0.009901987410242733}, "boolq": {"alias": "boolq", "acc,none": 0.690519877675841, "acc_stderr,none": 0.008085316258869078}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19737919737919737, "acc_stderr,none": 0.011395305685091192}, "copa": {"alias": "copa", "acc,none": 0.74, 
"acc_stderr,none": 0.0440844002276808}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4087831109340769, "acc_stderr,none": 0.0049060436130133975, "acc_norm,none": 0.5348536148177654, "acc_norm_stderr,none": 0.00497764373084859}, "mmlu": {"acc,none": 0.2452642073778664, "acc_stderr,none": 0.003625505784441234, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25738575982996814, "acc_stderr,none": 0.0063665893077057886, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.03932537680392871}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.03374402644139405}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591362}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.29957805907172996, "acc_stderr,none": 0.0298180247497531}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.23140495867768596, "acc_stderr,none": 0.038498560987940904}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25153374233128833, "acc_stderr,none": 0.034089978868575295}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2832369942196532, "acc_stderr,none": 0.02425790170532337}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1832797427652733, "acc_stderr,none": 0.021974198848265826}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.22530864197530864, "acc_stderr,none": 0.02324620264781975}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2666232073011734, "acc_stderr,none": 0.011293836031612138}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.34502923976608185, "acc_stderr,none": 0.036459813773888065}, "mmlu_other": {"acc,none": 0.2500804634695848, "acc_stderr,none": 0.007760065027467572, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24528301886792453, "acc_stderr,none": 0.026480357179895688}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0326926380614177}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036625}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3094170403587444, "acc_stderr,none": 0.031024411740572192}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.04058042015646034}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.029872577708891155}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24265644955300128, "acc_stderr,none": 0.015329888940899868}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.023929155517351287}, "mmlu_professional_accounting": {"alias": " 
- professional_accounting", "acc,none": 0.25177304964539005, "acc_stderr,none": 0.0258921511567094}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19117647058823528, "acc_stderr,none": 0.02388688192244033}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2891566265060241, "acc_stderr,none": 0.03529486801511115}, "mmlu_social_sciences": {"acc,none": 0.2297692557686058, "acc_stderr,none": 0.007576972665747824, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.04096985139843671}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.20202020202020202, "acc_stderr,none": 0.028606204289229876}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21243523316062177, "acc_stderr,none": 0.029519282616817247}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2282051282051282, "acc_stderr,none": 0.02127839386358628}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.026265024608275886}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1981651376146789, "acc_stderr,none": 0.01709057380421788}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596919}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.01812022425148458}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2636363636363636, "acc_stderr,none": 0.04220224692971987}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.18775510204081633, "acc_stderr,none": 0.025000256039546205}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.03014777593540922}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_stem": {"acc,none": 0.23755153821757058, "acc_stderr,none": 0.007568985513532479, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.038201699145179055}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.20394736842105263, "acc_stderr,none": 0.032790004063100515}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566017}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.16, "acc_stderr,none": 0.03684529491774708}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.038739587141493524}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2723404255319149, "acc_stderr,none": 0.029101290698386705}, "mmlu_electrical_engineering": {"alias": " - 
electrical_engineering", "acc,none": 0.2827586206896552, "acc_stderr,none": 0.037528339580033376}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23544973544973544, "acc_stderr,none": 0.02185150982203172}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1935483870967742, "acc_stderr,none": 0.02247525852553606}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.18719211822660098, "acc_stderr,none": 0.027444924966882618}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.026466117538959905}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.03603038545360384}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.026491914727355154}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.26785714285714285, "acc_stderr,none": 0.04203277291467763}, "mmlu_pro": {"exact_match,custom-extract": 0.10904255319148937, "exact_match_stderr,custom-extract": 0.0028365568284284656, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.13249651324965134, "exact_match_stderr,custom-extract": 0.012670137504949323}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10519645120405577, "exact_match_stderr,custom-extract": 0.010929524923270312}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0812720848056537, "exact_match_stderr,custom-extract": 0.008125177440053661}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.12195121951219512, "exact_match_stderr,custom-extract": 0.0161804554422017}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.13981042654028436, "exact_match_stderr,custom-extract": 0.011944090354236483}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07636738906088751, "exact_match_stderr,custom-extract": 0.008536226336689311}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.13691931540342298, "exact_match_stderr,custom-extract": 0.012026715288380386}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.13123359580052493, "exact_match_stderr,custom-extract": 0.017321369455841545}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.07720254314259764, "exact_match_stderr,custom-extract": 0.008047716247195589}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.10880829015544041, "exact_match_stderr,custom-extract": 0.008475194574359485}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11363636363636363, "exact_match_stderr,custom-extract": 0.010446330904020982}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12424849699398798, "exact_match_stderr,custom-extract": 0.014781596611020457}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10700538876058506, "exact_match_stderr,custom-extract": 0.008580051555617757}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.12531328320802004, "exact_match_stderr,custom-extract": 
0.011727235844461751}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.246, "acc_stderr,none": 0.01927981905635254, "acc_norm,none": 0.38, "acc_norm_stderr,none": 0.02172888143870171}, "piqa": {"alias": "piqa", "acc,none": 0.6713819368879217, "acc_stderr,none": 0.010959127105167044, "acc_norm,none": 0.6817192600652884, "acc_norm_stderr,none": 0.010868093932082233}, "race": {"alias": "race", "acc,none": 0.3626794258373206, "acc_stderr,none": 0.014879563111287505}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.42835209825997955, "acc_stderr,none": 0.011197308262606087}, "winogrande": {"alias": "winogrande", "acc,none": 0.6045777426992897, "acc_stderr,none": 0.013741678387545348}} {"created_at": "2025-04-24T20:37:38.314640", "global_step": 272000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.33447098976109213, "acc_stderr,none": 0.013787460322441374, "acc_norm,none": 0.3839590443686007, "acc_norm_stderr,none": 0.01421244498065189}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6586700336700336, "acc_stderr,none": 0.0097294738412429, "acc_norm,none": 0.6477272727272727, "acc_norm_stderr,none": 0.009801753933112771}, "boolq": {"alias": "boolq", "acc,none": 0.6553516819571865, "acc_stderr,none": 0.00831223533839807}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20966420966420968, "acc_stderr,none": 0.011654350093704637}, "copa": {"alias": "copa", "acc,none": 0.74, "acc_stderr,none": 0.0440844002276808}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.407787293367855, "acc_stderr,none": 0.004904189257891274, "acc_norm,none": 0.5314678350926111, "acc_norm_stderr,none": 0.004979889597551663}, "mmlu": {"acc,none": 0.2639225181598063, "acc_stderr,none": 0.003716245208930313, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27566418703506906, "acc_stderr,none": 0.006514578732282427, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23015873015873015, "acc_stderr,none": 0.037649508797906066}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.034277431758165236}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.030587591351604243}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2911392405063291, "acc_stderr,none": 0.029571601065753374}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.34710743801652894, "acc_stderr,none": 0.04345724570292534}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04330043749650742}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.27607361963190186, "acc_stderr,none": 0.03512385283705051}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.30346820809248554, "acc_stderr,none": 0.024752411960917205}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24581005586592178, "acc_stderr,none": 0.014400296429225601}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2958199356913183, "acc_stderr,none": 0.025922371788818777}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.025171041915309684}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2757496740547588, "acc_stderr,none": 0.011413813609160989}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 
0.3157894736842105, "acc_stderr,none": 0.035650796707083106}, "mmlu_other": {"acc,none": 0.2584486643064049, "acc_stderr,none": 0.007834134546120277, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22264150943396227, "acc_stderr,none": 0.025604233470899105}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0326926380614177}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.21973094170403587, "acc_stderr,none": 0.027790177064383595}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.04185832598928315}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.02948036054954119}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816508}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.28735632183908044, "acc_stderr,none": 0.0161824107306827}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.0256468630971379}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.3049645390070922, "acc_stderr,none": 0.027464708442022135}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.16544117647058823, "acc_stderr,none": 0.022571771025494767}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25301204819277107, "acc_stderr,none": 0.03384429155233134}, "mmlu_social_sciences": {"acc,none": 0.24991875203119923, "acc_stderr,none": 0.007805808016433731, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25757575757575757, "acc_stderr,none": 0.031156269519646843}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.23834196891191708, "acc_stderr,none": 0.030748905363909895}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.22564102564102564, "acc_stderr,none": 0.02119363252514854}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23949579831932774, "acc_stderr,none": 0.027722065493361262}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23853211009174313, "acc_stderr,none": 0.018272575810231863}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.21374045801526717, "acc_stderr,none": 0.0359546161177469}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2957516339869281, "acc_stderr,none": 0.01846315413263281}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2, "acc_stderr,none": 0.03831305140884601}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2571428571428571, "acc_stderr,none": 0.02797982353874455}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2537313432835821, "acc_stderr,none": 0.030769444967296014}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 
0.04408440022768078}, "mmlu_stem": {"acc,none": 0.2654614652711703, "acc_stderr,none": 0.007860455594114512, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.34074074074074073, "acc_stderr,none": 0.04094376269996794}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3026315789473684, "acc_stderr,none": 0.03738520676119669}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165044}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.23829787234042554, "acc_stderr,none": 0.027851252973889795}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.31724137931034485, "acc_stderr,none": 0.0387835237213862}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.26455026455026454, "acc_stderr,none": 0.022717467897708617}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24516129032258063, "acc_stderr,none": 0.024472243840895528}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.270935960591133, "acc_stderr,none": 0.03127090713297698}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.27037037037037037, "acc_stderr,none": 0.027080372815145668}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2847682119205298, "acc_stderr,none": 0.03684881521389023}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.028353212866863434}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2767857142857143, "acc_stderr,none": 0.04246624336697625}, "mmlu_pro": {"exact_match,custom-extract": 0.1188497340425532, "exact_match_stderr,custom-extract": 0.0029418486196032523, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.1603905160390516, "exact_match_stderr,custom-extract": 0.013714232219428371}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10266159695817491, "exact_match_stderr,custom-extract": 0.010812323380686575}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0715547703180212, "exact_match_stderr,custom-extract": 0.007664187803003885}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.11219512195121951, "exact_match_stderr,custom-extract": 0.015605730293675806}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.15402843601895735, 
"exact_match_stderr,custom-extract": 0.01243267405840754}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09597523219814241, "exact_match_stderr,custom-extract": 0.009467429322574824}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.14180929095354522, "exact_match_stderr,custom-extract": 0.012204871709898264}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12335958005249344, "exact_match_stderr,custom-extract": 0.016869623436798514}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.13169845594913715, "exact_match_stderr,custom-extract": 0.01019598729669261}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.09622501850481126, "exact_match_stderr,custom-extract": 0.008026150053444933}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10497835497835498, "exact_match_stderr,custom-extract": 0.010089410685404702}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1623246492985972, "exact_match_stderr,custom-extract": 0.016524009398562526}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.11624326404926867, "exact_match_stderr,custom-extract": 0.008896371709498198}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.14661654135338345, "exact_match_stderr,custom-extract": 0.012529520031289995}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.248, "acc_stderr,none": 0.019332342821239103, "acc_norm,none": 0.366, "acc_norm_stderr,none": 0.021564276850201614}, "piqa": {"alias": "piqa", "acc,none": 0.6741022850924918, "acc_stderr,none": 0.010935760218903948, "acc_norm,none": 0.6838955386289445, "acc_norm_stderr,none": 0.010848148455700443}, "race": {"alias": "race", "acc,none": 0.33779904306220093, "acc_stderr,none": 0.014637734314782855}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.42528147389969295, "acc_stderr,none": 0.011187027585757608}, "winogrande": {"alias": "winogrande", "acc,none": 0.6108918705603789, "acc_stderr,none": 0.013702520871485952}} {"created_at": "2025-04-25T00:21:46.835410", "global_step": 276000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.32337883959044367, "acc_stderr,none": 0.013669421630012134, "acc_norm,none": 0.3626279863481229, "acc_norm_stderr,none": 0.014049106564955005}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6372053872053872, "acc_stderr,none": 0.009865936757013942, "acc_norm,none": 0.6014309764309764, "acc_norm_stderr,none": 0.010046455400477933}, "boolq": {"alias": "boolq", "acc,none": 0.581039755351682, "acc_stderr,none": 0.008629425249245246}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21703521703521703, "acc_stderr,none": 0.011802018846529998}, "copa": {"alias": "copa", "acc,none": 0.73, "acc_stderr,none": 0.04461960433384741}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.40748854809798846, "acc_stderr,none": 0.004903628887264535, "acc_norm,none": 0.5393347938657638, "acc_norm_stderr,none": 0.0049743168079204045}, "mmlu": {"acc,none": 0.27937615724255804, "acc_stderr,none": 0.0037850674585584733, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27035069075451645, "acc_stderr,none": 0.0064714505773872414, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.042163702135578345}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2909090909090909, 
"acc_stderr,none": 0.03546563019624336}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.29901960784313725, "acc_stderr,none": 0.03213325717373618}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.24050632911392406, "acc_stderr,none": 0.027820781981149678}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25766871165644173, "acc_stderr,none": 0.03436150827846917}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.023445826276545543}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2446927374301676, "acc_stderr,none": 0.014378169884098423}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.24758842443729903, "acc_stderr,none": 0.024513879973621967}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3117283950617284, "acc_stderr,none": 0.02577311116963046}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2894393741851369, "acc_stderr,none": 0.01158265970221023}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.21637426900584794, "acc_stderr,none": 0.03158149539338733}, "mmlu_other": {"acc,none": 0.28838107499195365, "acc_stderr,none": 0.008117323499672318, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2943396226415094, "acc_stderr,none": 0.02804918631569525}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2658959537572254, "acc_stderr,none": 0.03368762932259431}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3183856502242152, "acc_stderr,none": 0.03126580522513713}, "mmlu_management": {"alias": " - management", "acc,none": 0.3786407766990291, "acc_stderr,none": 0.04802694698258974}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3034188034188034, "acc_stderr,none": 0.030118210106942652}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2848020434227331, "acc_stderr,none": 0.016139174096522567}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3137254901960784, "acc_stderr,none": 0.026568921015457152}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.26595744680851063, "acc_stderr,none": 0.026358065698880592}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.21323529411764705, "acc_stderr,none": 0.024880971512294275}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2710843373493976, "acc_stderr,none": 0.03460579907553027}, "mmlu_social_sciences": {"acc,none": 0.28956776080597985, "acc_stderr,none": 0.008183382963701538, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.04096985139843671}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2777777777777778, 
"acc_stderr,none": 0.03191178226713548}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.32642487046632124, "acc_stderr,none": 0.033840286211432945}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2717948717948718, "acc_stderr,none": 0.022556551010132354}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.02959732973097808}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.30091743119266057, "acc_stderr,none": 0.01966475136680211}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.29770992366412213, "acc_stderr,none": 0.040103589424622034}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.26633986928104575, "acc_stderr,none": 0.017883188134667192}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3090909090909091, "acc_stderr,none": 0.044262946482000985}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.32653061224489793, "acc_stderr,none": 0.030021056238440307}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.3034825870646766, "acc_stderr,none": 0.03251006816458618}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_stem": {"acc,none": 0.27402473834443386, "acc_stderr,none": 0.007946145654003877, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.31851851851851853, "acc_stderr,none": 0.040247784019771096}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3092105263157895, "acc_stderr,none": 0.037610708698674784}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.03586879280080341}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.04220773659171452}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165065}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28085106382978725, "acc_stderr,none": 0.029379170464124815}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24338624338624337, "acc_stderr,none": 0.02210112878741543}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2967741935483871, "acc_stderr,none": 0.025988500792411898}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.031447125816782426}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_high_school_mathematics": {"alias": " - 
high_school_mathematics", "acc,none": 0.27037037037037037, "acc_stderr,none": 0.027080372815145658}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.32450331125827814, "acc_stderr,none": 0.03822746937658753}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.030225226160012414}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.29464285714285715, "acc_stderr,none": 0.043270409325787296}, "mmlu_pro": {"exact_match,custom-extract": 0.10787898936170212, "exact_match_stderr,custom-extract": 0.002812415920533155, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.1506276150627615, "exact_match_stderr,custom-extract": 0.013367335774300295}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10139416983523447, "exact_match_stderr,custom-extract": 0.010752959229023348}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.05565371024734982, "exact_match_stderr,custom-extract": 0.006816813274624113}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.13414634146341464, "exact_match_stderr,custom-extract": 0.0168519441272791}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.15165876777251186, "exact_match_stderr,custom-extract": 0.012353933579536042}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08565531475748193, "exact_match_stderr,custom-extract": 0.008994860895662203}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.14058679706601468, "exact_match_stderr,custom-extract": 0.012160802933047537}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.09448818897637795, "exact_match_stderr,custom-extract": 0.015005277240142296}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10626702997275204, "exact_match_stderr,custom-extract": 0.009291949023141255}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.076980014803849, "exact_match_stderr,custom-extract": 0.007254837064543381}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10606060606060606, "exact_match_stderr,custom-extract": 0.010135151380336849}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11623246492985972, "exact_match_stderr,custom-extract": 0.014362104240159242}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.08314087759815242, "exact_match_stderr,custom-extract": 0.007663395880276773}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.18170426065162906, "exact_match_stderr,custom-extract": 0.01365867400405887}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.256, "acc_stderr,none": 0.01953692357474761, "acc_norm,none": 0.374, "acc_norm_stderr,none": 0.021660710347204484}, "piqa": {"alias": "piqa", "acc,none": 0.691512513601741, "acc_stderr,none": 0.010776164678037157, "acc_norm,none": 0.6920565832426551, "acc_norm_stderr,none": 0.010770892367463685}, "race": {"alias": "race", "acc,none": 0.33875598086124403, "acc_stderr,none": 0.014647857789710093}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4181166837256909, "acc_stderr,none": 0.01116132051027065}, "winogrande": {"alias": "winogrande", "acc,none": 0.5895816890292028, "acc_stderr,none": 0.013825107120035861}} {"created_at": 
"2025-04-25T02:04:00.678842", "global_step": 278000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.33361774744027306, "acc_stderr,none": 0.013778687054176543, "acc_norm,none": 0.39419795221843, "acc_norm_stderr,none": 0.014280522667467325}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.656986531986532, "acc_stderr,none": 0.009740965666489224, "acc_norm,none": 0.6279461279461279, "acc_norm_stderr,none": 0.009918187193096471}, "boolq": {"alias": "boolq", "acc,none": 0.5614678899082569, "acc_stderr,none": 0.008678720482001873}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2194922194922195, "acc_stderr,none": 0.011849997754533976}, "copa": {"alias": "copa", "acc,none": 0.72, "acc_stderr,none": 0.04512608598542127}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.410973909579765, "acc_stderr,none": 0.004910049928688081, "acc_norm,none": 0.5406293567018522, "acc_norm_stderr,none": 0.004973280417705511}, "mmlu": {"acc,none": 0.25858139866115937, "acc_stderr,none": 0.0036919492194160113, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2667375132837407, "acc_stderr,none": 0.0064444568187280405, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.03333333333333336}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.296969696969697, "acc_stderr,none": 0.03567969772268049}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.03132179803083292}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.29535864978902954, "acc_stderr,none": 0.02969633871342289}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.3305785123966942, "acc_stderr,none": 0.04294340845212093}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26380368098159507, "acc_stderr,none": 0.03462419931615624}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24566473988439305, "acc_stderr,none": 0.023176298203992005}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.31511254019292606, "acc_stderr,none": 0.02638527370346449}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.02465968518596728}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26727509778357234, "acc_stderr,none": 0.011302607515637523}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.034462962170884265}, "mmlu_other": {"acc,none": 0.2574831026713872, "acc_stderr,none": 0.007829730281781475, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24528301886792453, "acc_stderr,none": 0.026480357179895702}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0326926380614177}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 
0.30493273542600896, "acc_stderr,none": 0.030898610882477515}, "mmlu_management": {"alias": " - management", "acc,none": 0.2524271844660194, "acc_stderr,none": 0.0430125039969088}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.02974504857267407}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2707535121328225, "acc_stderr,none": 0.015889888362560486}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.238562091503268, "acc_stderr,none": 0.02440439492808788}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.026684564340460997}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.16176470588235295, "acc_stderr,none": 0.022368672562886757}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25903614457831325, "acc_stderr,none": 0.03410646614071857}, "mmlu_social_sciences": {"acc,none": 0.25186870328241795, "acc_stderr,none": 0.00782966874384669, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.04303684033537315}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.23737373737373738, "acc_stderr,none": 0.030313710538198896}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20725388601036268, "acc_stderr,none": 0.029252823291803627}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2358974358974359, "acc_stderr,none": 0.021525965407408726}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.22268907563025211, "acc_stderr,none": 0.027025433498882395}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25321100917431194, "acc_stderr,none": 0.018644073041375046}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596919}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2826797385620915, "acc_stderr,none": 0.018217269552053442}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2636363636363636, "acc_stderr,none": 0.04220224692971987}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24489795918367346, "acc_stderr,none": 0.02752963744017492}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2537313432835821, "acc_stderr,none": 0.030769444967296014}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_stem": {"acc,none": 0.25404376784015226, "acc_stderr,none": 0.00774280029353387, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542126}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3111111111111111, "acc_stderr,none": 0.03999262876617721}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.28289473684210525, "acc_stderr,none": 0.03665349695640767}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.03745554791462457}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 
0.038612291966536955}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.04158307533083286}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2723404255319149, "acc_stderr,none": 0.0291012906983867}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.32413793103448274, "acc_stderr,none": 0.03900432069185553}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.02278967314577657}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.22903225806451613, "acc_stderr,none": 0.02390491431178265}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.031785297106427496}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621505}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.21851851851851853, "acc_stderr,none": 0.025195752251823782}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.24503311258278146, "acc_stderr,none": 0.03511807571804724}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.19907407407407407, "acc_stderr,none": 0.02723229846269023}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.20535714285714285, "acc_stderr,none": 0.03834241021419073}, "mmlu_pro": {"exact_match,custom-extract": 0.10920877659574468, "exact_match_stderr,custom-extract": 0.002829304987748507, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.17573221757322174, "exact_match_stderr,custom-extract": 0.014223397460075789}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.08998732572877059, "exact_match_stderr,custom-extract": 0.010194156217460268}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06448763250883392, "exact_match_stderr,custom-extract": 0.007303510883881879}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.12439024390243902, "exact_match_stderr,custom-extract": 0.016318746710195602}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.15758293838862558, "exact_match_stderr,custom-extract": 0.012548863257674918}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.1021671826625387, "exact_match_stderr,custom-extract": 0.009734547484109865}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1198044009779951, "exact_match_stderr,custom-extract": 0.011360957995074566}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10236220472440945, "exact_match_stderr,custom-extract": 0.015549935163883111}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09536784741144415, "exact_match_stderr,custom-extract": 0.008856062181125265}, "mmlu_pro_math": {"alias": " - math", 
"exact_match,custom-extract": 0.07772020725388601, "exact_match_stderr,custom-extract": 0.007286709191616166}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11363636363636363, "exact_match_stderr,custom-extract": 0.010446330904020988}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.09218436873747494, "exact_match_stderr,custom-extract": 0.012963217262821273}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09699769053117784, "exact_match_stderr,custom-extract": 0.008214625733066366}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.17167919799498746, "exact_match_stderr,custom-extract": 0.01335761621245641}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.252, "acc_stderr,none": 0.01943572728224953, "acc_norm,none": 0.348, "acc_norm_stderr,none": 0.0213237286328075}, "piqa": {"alias": "piqa", "acc,none": 0.6833514689880305, "acc_stderr,none": 0.010853160531978481, "acc_norm,none": 0.6855277475516867, "acc_norm_stderr,none": 0.010833009065106572}, "race": {"alias": "race", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.01488799043759141}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.41299897645854655, "acc_stderr,none": 0.011141477698035245}, "winogrande": {"alias": "winogrande", "acc,none": 0.5872138910812944, "acc_stderr,none": 0.013837060648682085}} {"created_at": "2025-04-25T03:44:32.206509", "global_step": 280000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3302047781569966, "acc_stderr,none": 0.013743085603760424, "acc_norm,none": 0.36945392491467577, "acc_norm_stderr,none": 0.014104578366491894}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6498316498316499, "acc_stderr,none": 0.009788295410093146, "acc_norm,none": 0.6287878787878788, "acc_norm_stderr,none": 0.009913599001845746}, "boolq": {"alias": "boolq", "acc,none": 0.6223241590214067, "acc_stderr,none": 0.00847930920828164}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21621621621621623, "acc_stderr,none": 0.011785889175486662}, "copa": {"alias": "copa", "acc,none": 0.7, "acc_stderr,none": 0.046056618647183814}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.41216889065923124, "acc_stderr,none": 0.004912192800263317, "acc_norm,none": 0.5396335391356304, "acc_norm_stderr,none": 0.004974080638364243}, "mmlu": {"acc,none": 0.26406494801310354, "acc_stderr,none": 0.003715414637780467, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.26971307120085014, "acc_stderr,none": 0.006470626601289889, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.037184890068181146}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.033744026441394036}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.29901960784313725, "acc_stderr,none": 0.03213325717373617}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2489451476793249, "acc_stderr,none": 0.028146970599422644}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.24793388429752067, "acc_stderr,none": 0.03941897526516304}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.0401910747255735}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25766871165644173, "acc_stderr,none": 
0.03436150827846917}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2947976878612717, "acc_stderr,none": 0.024547617794803835}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3086816720257235, "acc_stderr,none": 0.02623696588115327}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.29012345679012347, "acc_stderr,none": 0.025251173936495022}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2790091264667536, "acc_stderr,none": 0.011455208832803552}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.25146198830409355, "acc_stderr,none": 0.033275044238468436}, "mmlu_other": {"acc,none": 0.24557450917283552, "acc_stderr,none": 0.007712400493772573, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24150943396226415, "acc_stderr,none": 0.02634148037111836}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.27167630057803466, "acc_stderr,none": 0.03391750322321659}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.21524663677130046, "acc_stderr,none": 0.027584066602208274}, "mmlu_management": {"alias": " - management", "acc,none": 0.22330097087378642, "acc_stderr,none": 0.04123553189891431}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.26495726495726496, "acc_stderr,none": 0.028911208802749486}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.26436781609195403, "acc_stderr,none": 0.01576998484069052}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2581699346405229, "acc_stderr,none": 0.025058503316958147}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.026684564340460997}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.17279411764705882, "acc_stderr,none": 0.022966067585581756}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.1927710843373494, "acc_stderr,none": 0.03070982405056527}, "mmlu_social_sciences": {"acc,none": 0.2658433539161521, "acc_stderr,none": 0.007944292193918118, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281337}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25757575757575757, "acc_stderr,none": 0.031156269519646836}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.33678756476683935, "acc_stderr,none": 0.03410780251836184}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.28717948717948716, "acc_stderr,none": 0.022939925418530616}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.0275536144678638}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22201834862385322, "acc_stderr,none": 0.017818849564796617}, "mmlu_human_sexuality": 
{"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.017952449196987862}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072774}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.363265306122449, "acc_stderr,none": 0.030789051139030802}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.26865671641791045, "acc_stderr,none": 0.03134328358208954}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_stem": {"acc,none": 0.2721217887725975, "acc_stderr,none": 0.007918242273124395, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621503}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04072314811876837}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3092105263157895, "acc_stderr,none": 0.037610708698674805}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566017}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.040925639582376556}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2170212765957447, "acc_stderr,none": 0.026947483121496228}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.31724137931034485, "acc_stderr,none": 0.038783523721386215}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.02286083830923207}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2645161290322581, "acc_stderr,none": 0.02509189237885928}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3103448275862069, "acc_stderr,none": 0.03255086769970103}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.37, "acc_stderr,none": 0.048523658709391}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.27037037037037037, "acc_stderr,none": 0.02708037281514566}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2913907284768212, "acc_stderr,none": 0.03710185726119995}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2175925925925926, "acc_stderr,none": 0.028139689444859672}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.19642857142857142, "acc_stderr,none": 0.037709700493470194}, "mmlu_pro": {"exact_match,custom-extract": 0.11003989361702128, "exact_match_stderr,custom-extract": 0.0028452852633732346, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", 
"exact_match,custom-extract": 0.13668061366806136, "exact_match_stderr,custom-extract": 0.012837566181656686}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10392902408111533, "exact_match_stderr,custom-extract": 0.010871175856870051}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0812720848056537, "exact_match_stderr,custom-extract": 0.00812517744005369}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.11463414634146342, "exact_match_stderr,custom-extract": 0.01575276269742975}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1552132701421801, "exact_match_stderr,custom-extract": 0.012471657591396927}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09597523219814241, "exact_match_stderr,custom-extract": 0.009467429322574824}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12836185819070906, "exact_match_stderr,custom-extract": 0.01170240387656549}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11811023622047244, "exact_match_stderr,custom-extract": 0.01655614119804243}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11989100817438691, "exact_match_stderr,custom-extract": 0.00979411485319414}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.06291635825314582, "exact_match_stderr,custom-extract": 0.006608518078813448}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11147186147186147, "exact_match_stderr,custom-extract": 0.01035898893508243}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12024048096192384, "exact_match_stderr,custom-extract": 0.014574466566662014}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10623556581986143, "exact_match_stderr,custom-extract": 0.00855281652736039}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.14160401002506265, "exact_match_stderr,custom-extract": 0.012349587610134348}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.258, "acc_stderr,none": 0.019586711785215837, "acc_norm,none": 0.374, "acc_norm_stderr,none": 0.021660710347204484}, "piqa": {"alias": "piqa", "acc,none": 0.6751904243743199, "acc_stderr,none": 0.010926296238294036, "acc_norm,none": 0.676278563656148, "acc_norm_stderr,none": 0.010916765010708752}, "race": {"alias": "race", "acc,none": 0.3435406698564593, "acc_stderr,none": 0.014697475413671399}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4263050153531218, "acc_stderr,none": 0.011190503463264747}, "winogrande": {"alias": "winogrande", "acc,none": 0.5911602209944752, "acc_stderr,none": 0.013816954295135698}} {"created_at": "2025-04-25T05:50:06.281022", "global_step": 282000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3464163822525597, "acc_stderr,none": 0.013905011180063247, "acc_norm,none": 0.39761092150170646, "acc_norm_stderr,none": 0.014301752223279535}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6544612794612794, "acc_stderr,none": 0.009757948730670294, "acc_norm,none": 0.6342592592592593, "acc_norm_stderr,none": 0.009882988069418843}, "boolq": {"alias": "boolq", "acc,none": 0.5605504587155963, "acc_stderr,none": 0.008680693125810181}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.22768222768222768, "acc_stderr,none": 0.012005566703357262}, "copa": {"alias": "copa", "acc,none": 0.73, 
"acc_stderr,none": 0.0446196043338474}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.41525592511451903, "acc_stderr,none": 0.004917590378138213, "acc_norm,none": 0.5430193188607847, "acc_norm_stderr,none": 0.004971278309204196}, "mmlu": {"acc,none": 0.2630679390400228, "acc_stderr,none": 0.003710426467121802, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2720510095642933, "acc_stderr,none": 0.006485135655500519, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.03619604524124249}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.03427743175816524}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.031145570659486782}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.29535864978902954, "acc_stderr,none": 0.02969633871342289}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.33884297520661155, "acc_stderr,none": 0.04320767807536669}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3425925925925926, "acc_stderr,none": 0.04587904741301812}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.27607361963190186, "acc_stderr,none": 0.035123852837050516}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2832369942196532, "acc_stderr,none": 0.02425790170532337}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3086816720257235, "acc_stderr,none": 0.026236965881153266}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2345679012345679, "acc_stderr,none": 0.02357688174400572}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27835723598435463, "acc_stderr,none": 0.011446990197380985}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.033773102522091945}, "mmlu_other": {"acc,none": 0.24943675571290633, "acc_stderr,none": 0.007748527453200767, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24150943396226415, "acc_stderr,none": 0.026341480371118352}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.03295304696818318}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2600896860986547, "acc_stderr,none": 0.029442495585857473}, "mmlu_management": {"alias": " - management", "acc,none": 0.1650485436893204, "acc_stderr,none": 0.036756688322331886}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2863247863247863, "acc_stderr,none": 0.02961432369045665}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2835249042145594, "acc_stderr,none": 0.016117318166832276}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.02463004897982478}, "mmlu_professional_accounting": {"alias": " - 
professional_accounting", "acc,none": 0.23404255319148937, "acc_stderr,none": 0.025257861359432414}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.16176470588235295, "acc_stderr,none": 0.022368672562886754}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2469879518072289, "acc_stderr,none": 0.03357351982064536}, "mmlu_social_sciences": {"acc,none": 0.26194345141371467, "acc_stderr,none": 0.007916434340599705, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.15789473684210525, "acc_stderr,none": 0.034302659784857}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.32323232323232326, "acc_stderr,none": 0.03332299921070644}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.26424870466321243, "acc_stderr,none": 0.03182155050916649}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.24358974358974358, "acc_stderr,none": 0.021763733684173923}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.25210084033613445, "acc_stderr,none": 0.028205545033277726}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23669724770642203, "acc_stderr,none": 0.018224078117299078}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596919}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2908496732026144, "acc_stderr,none": 0.018373116915903966}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.04069306319721376}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3142857142857143, "acc_stderr,none": 0.029719329422417475}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2537313432835821, "acc_stderr,none": 0.030769444967296014}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_stem": {"acc,none": 0.26419283222327944, "acc_stderr,none": 0.007837142313078239, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.34814814814814815, "acc_stderr,none": 0.041153246103369526}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3355263157894737, "acc_stderr,none": 0.038424985593952694}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.03852084696008534}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036622}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.17647058823529413, "acc_stderr,none": 0.0379328118530781}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720683}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2, "acc_stderr,none": 0.0261488180184245}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", 
"acc,none": 0.2896551724137931, "acc_stderr,none": 0.037800192304380135}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23544973544973544, "acc_stderr,none": 0.021851509822031722}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.26129032258064516, "acc_stderr,none": 0.024993053397764826}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.24630541871921183, "acc_stderr,none": 0.030315099285617743}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.026962424325073828}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2913907284768212, "acc_stderr,none": 0.03710185726119995}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.02876511171804696}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.24107142857142858, "acc_stderr,none": 0.04059867246952685}, "mmlu_pro": {"exact_match,custom-extract": 0.11436170212765957, "exact_match_stderr,custom-extract": 0.0028895190442341746, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.16178521617852162, "exact_match_stderr,custom-extract": 0.013762285522387247}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10519645120405577, "exact_match_stderr,custom-extract": 0.010929524923270302}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06713780918727916, "exact_match_stderr,custom-extract": 0.007441509249865654}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.08536585365853659, "exact_match_stderr,custom-extract": 0.013816694190586964}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.16824644549763032, "exact_match_stderr,custom-extract": 0.012884171419279476}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08565531475748193, "exact_match_stderr,custom-extract": 0.008994860895662208}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1393643031784841, "exact_match_stderr,custom-extract": 0.012116422904979631}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12860892388451445, "exact_match_stderr,custom-extract": 0.017173163625244695}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11171662125340599, "exact_match_stderr,custom-extract": 0.00949813463931117}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07994078460399703, "exact_match_stderr,custom-extract": 0.0073811700146960025}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.14177489177489178, "exact_match_stderr,custom-extract": 0.011481520874881244}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12024048096192384, "exact_match_stderr,custom-extract": 0.01457446656666198}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1100846805234796, "exact_match_stderr,custom-extract": 0.008687612438998104}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.14160401002506265, "exact_match_stderr,custom-extract": 0.012349587610134346}, 
"openbookqa": {"alias": "openbookqa", "acc,none": 0.256, "acc_stderr,none": 0.019536923574747605, "acc_norm,none": 0.374, "acc_norm_stderr,none": 0.021660710347204484}, "piqa": {"alias": "piqa", "acc,none": 0.6806311207834603, "acc_stderr,none": 0.010877964076613742, "acc_norm,none": 0.6860718171926007, "acc_norm_stderr,none": 0.010827928134189646}, "race": {"alias": "race", "acc,none": 0.36076555023923446, "acc_stderr,none": 0.014862517074604979}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.417093142272262, "acc_stderr,none": 0.011157450926787523}, "winogrande": {"alias": "winogrande", "acc,none": 0.6037884767166535, "acc_stderr,none": 0.013746404157154947}} {"created_at": "2025-04-25T07:20:03.108876", "global_step": 284000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.33276450511945393, "acc_stderr,none": 0.01376986304619231, "acc_norm,none": 0.3703071672354949, "acc_norm_stderr,none": 0.01411129875167495}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6447811447811448, "acc_stderr,none": 0.009820245899287124, "acc_norm,none": 0.6304713804713805, "acc_norm_stderr,none": 0.009904325878447324}, "boolq": {"alias": "boolq", "acc,none": 0.5201834862385321, "acc_stderr,none": 0.008737927070893482}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21621621621621623, "acc_stderr,none": 0.01178588917548665}, "copa": {"alias": "copa", "acc,none": 0.72, "acc_stderr,none": 0.045126085985421276}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.41017725552678747, "acc_stderr,none": 0.004908604732082809, "acc_norm,none": 0.5366460864369648, "acc_norm_stderr,none": 0.004976361454341358}, "mmlu": {"acc,none": 0.2623557897735365, "acc_stderr,none": 0.0037071079146553908, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27056323060573856, "acc_stderr,none": 0.006478733736280569, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.038932596106046734}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.03317505930009181}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.031145570659486782}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3080168776371308, "acc_stderr,none": 0.030052389335605695}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2975206611570248, "acc_stderr,none": 0.04173349148083499}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094633}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26380368098159507, "acc_stderr,none": 0.03462419931615623}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2514450867052023, "acc_stderr,none": 0.02335736578587404}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.25251396648044694, "acc_stderr,none": 0.014530330201468636}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2990353697749196, "acc_stderr,none": 0.026003301117885142}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2808641975308642, "acc_stderr,none": 0.02500646975579922}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2816166883963494, "acc_stderr,none": 0.011487783272786696}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2222222222222222, 
"acc_stderr,none": 0.03188578017686398}, "mmlu_other": {"acc,none": 0.2574831026713872, "acc_stderr,none": 0.007828549306456628, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.27547169811320754, "acc_stderr,none": 0.02749566368372405}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173043}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.30493273542600896, "acc_stderr,none": 0.030898610882477515}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.04058042015646034}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.02948036054954119}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768081}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.28607918263090676, "acc_stderr,none": 0.016160871405127526}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.21895424836601307, "acc_stderr,none": 0.02367908986180772}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.026684564340460987}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.16544117647058823, "acc_stderr,none": 0.022571771025494746}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2710843373493976, "acc_stderr,none": 0.03460579907553027}, "mmlu_social_sciences": {"acc,none": 0.2596685082872928, "acc_stderr,none": 0.00790067161515305, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.040969851398436716}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25252525252525254, "acc_stderr,none": 0.030954055470365907}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22279792746113988, "acc_stderr,none": 0.030031147977641545}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2717948717948718, "acc_stderr,none": 0.022556551010132382}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23109243697478993, "acc_stderr,none": 0.027381406927868963}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26422018348623855, "acc_stderr,none": 0.018904164171510175}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.03641297081313729}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.29248366013071897, "acc_stderr,none": 0.018403415710109783}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.33636363636363636, "acc_stderr,none": 0.04525393596302506}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20408163265306123, "acc_stderr,none": 0.02580128347509051}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.263681592039801, "acc_stderr,none": 0.03115715086935556}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 
0.04292346959909284}, "mmlu_stem": {"acc,none": 0.2575325087218522, "acc_stderr,none": 0.007755148864794449, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.31851851851851853, "acc_stderr,none": 0.040247784019771096}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.21710526315789475, "acc_stderr,none": 0.033550453048829254}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.0358687928008034}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.15, "acc_stderr,none": 0.035887028128263714}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.038739587141493524}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.22127659574468084, "acc_stderr,none": 0.027136349602424056}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.03695183311650232}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776578}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2838709677419355, "acc_stderr,none": 0.02564938106302926}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.26108374384236455, "acc_stderr,none": 0.030903796952114482}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.02620276653465215}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2980132450331126, "acc_stderr,none": 0.037345356767871984}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16203703703703703, "acc_stderr,none": 0.02513045365226846}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.4017857142857143, "acc_stderr,none": 0.04653333146973646}, "mmlu_pro": {"exact_match,custom-extract": 0.11269946808510638, "exact_match_stderr,custom-extract": 0.0028737451531158746, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.14504881450488144, "exact_match_stderr,custom-extract": 0.013160465168737605}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11406844106463879, "exact_match_stderr,custom-extract": 0.011324518110214692}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07243816254416961, "exact_match_stderr,custom-extract": 0.007707683029020946}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1048780487804878, "exact_match_stderr,custom-extract": 0.015150318019731044}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.14928909952606634, 
"exact_match_stderr,custom-extract": 0.012274145317879095}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09597523219814241, "exact_match_stderr,custom-extract": 0.009467429322574824}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.16014669926650366, "exact_match_stderr,custom-extract": 0.012830680316387452}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.09973753280839895, "exact_match_stderr,custom-extract": 0.015371706524248121}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08810172570390554, "exact_match_stderr,custom-extract": 0.008546121482440766}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0843819393042191, "exact_match_stderr,custom-extract": 0.007565106428639985}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1277056277056277, "exact_match_stderr,custom-extract": 0.010985901551102238}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13627254509018036, "exact_match_stderr,custom-extract": 0.015373681322287381}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10546574287913779, "exact_match_stderr,custom-extract": 0.008525440942560938}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.14411027568922305, "exact_match_stderr,custom-extract": 0.01244019591653015}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.254, "acc_stderr,none": 0.019486596801643382, "acc_norm,none": 0.37, "acc_norm_stderr,none": 0.021613289165165785}, "piqa": {"alias": "piqa", "acc,none": 0.6800870511425462, "acc_stderr,none": 0.01088287358209206, "acc_norm,none": 0.6828073993471164, "acc_norm_stderr,none": 0.010858155454380873}, "race": {"alias": "race", "acc,none": 0.3550239234449761, "acc_stderr,none": 0.014809839887617084}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4421699078812692, "acc_stderr,none": 0.011238140029326932}, "winogrande": {"alias": "winogrande", "acc,none": 0.6053670086819258, "acc_stderr,none": 0.013736915172371885}} {"created_at": "2025-04-25T08:58:53.764194", "global_step": 286000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3378839590443686, "acc_stderr,none": 0.013822047922283514, "acc_norm,none": 0.378839590443686, "acc_norm_stderr,none": 0.014175915490000324}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.656986531986532, "acc_stderr,none": 0.009740965666489224, "acc_norm,none": 0.6367845117845118, "acc_norm_stderr,none": 0.009868397136118796}, "boolq": {"alias": "boolq", "acc,none": 0.5948012232415902, "acc_stderr,none": 0.008586427929715524}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21294021294021295, "acc_stderr,none": 0.01172067944979758}, "copa": {"alias": "copa", "acc,none": 0.74, "acc_stderr,none": 0.0440844002276808}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.41286596295558653, "acc_stderr,none": 0.004913429010559064, "acc_norm,none": 0.5415255925114519, "acc_norm_stderr,none": 0.004972543127767882}, "mmlu": {"acc,none": 0.256943455348241, "acc_stderr,none": 0.003681335582641745, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2786397449521785, "acc_stderr,none": 0.006533252246265169, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.18253968253968253, "acc_stderr,none": 0.03455071019102148}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2727272727272727, 
"acc_stderr,none": 0.03477691162163659}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28921568627450983, "acc_stderr,none": 0.031822318676475544}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.27848101265822783, "acc_stderr,none": 0.02917868230484255}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.33884297520661155, "acc_stderr,none": 0.043207678075366705}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.04077494709252627}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3006134969325153, "acc_stderr,none": 0.03602511318806771}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.29190751445086704, "acc_stderr,none": 0.024476994076247326}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3086816720257235, "acc_stderr,none": 0.026236965881153266}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.025407197798890165}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.28552803129074317, "acc_stderr,none": 0.011535751586665652}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.035087719298245626}, "mmlu_other": {"acc,none": 0.2339877695526231, "acc_stderr,none": 0.007583410588658217, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22264150943396227, "acc_stderr,none": 0.025604233470899105}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0326926380614177}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.20179372197309417, "acc_stderr,none": 0.02693611191280227}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.03916667762822584}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.24786324786324787, "acc_stderr,none": 0.028286324075564386}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.26053639846743293, "acc_stderr,none": 0.015696008563807103}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2581699346405229, "acc_stderr,none": 0.02505850331695815}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24113475177304963, "acc_stderr,none": 0.02551873104953776}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.15808823529411764, "acc_stderr,none": 0.022161462608068522}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.1927710843373494, "acc_stderr,none": 0.030709824050565267}, "mmlu_social_sciences": {"acc,none": 0.23886902827429315, "acc_stderr,none": 0.007689998797676367, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.040969851398436716}, "mmlu_high_school_geography": {"alias": " - high_school_geography", 
"acc,none": 0.25252525252525254, "acc_stderr,none": 0.030954055470365907}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22279792746113988, "acc_stderr,none": 0.030031147977641545}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.22564102564102564, "acc_stderr,none": 0.021193632525148536}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.22268907563025211, "acc_stderr,none": 0.027025433498882385}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.21467889908256882, "acc_stderr,none": 0.017604304149256487}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596918}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.28104575163398693, "acc_stderr,none": 0.018185218954318082}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072775}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22448979591836735, "acc_stderr,none": 0.02671143055553841}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.030147775935409224}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_stem": {"acc,none": 0.26482714874722485, "acc_stderr,none": 0.007847516618986892, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621503}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.34814814814814815, "acc_stderr,none": 0.041153246103369526}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.03690677986137283}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2127659574468085, "acc_stderr,none": 0.02675439134803976}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.32413793103448274, "acc_stderr,none": 0.03900432069185553}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.02306818884826111}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25483870967741934, "acc_stderr,none": 0.024790118459332208}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.30049261083743845, "acc_stderr,none": 0.03225799476233485}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_high_school_mathematics": 
{"alias": " - high_school_mathematics", "acc,none": 0.24814814814814815, "acc_stderr,none": 0.0263357394040558}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.03603038545360384}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2175925925925926, "acc_stderr,none": 0.02813968944485967}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755805}, "mmlu_pro": {"exact_match,custom-extract": 0.10438829787234043, "exact_match_stderr,custom-extract": 0.002780206345048723, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.13110181311018132, "exact_match_stderr,custom-extract": 0.01261340333645991}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09125475285171103, "exact_match_stderr,custom-extract": 0.010258543729935026}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06537102473498234, "exact_match_stderr,custom-extract": 0.007349892115635171}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.0951219512195122, "exact_match_stderr,custom-extract": 0.014506870947377817}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.12796208530805686, "exact_match_stderr,custom-extract": 0.011505210023672559}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08668730650154799, "exact_match_stderr,custom-extract": 0.009043776534229919}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.15036674816625917, "exact_match_stderr,custom-extract": 0.012504911603573}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12598425196850394, "exact_match_stderr,custom-extract": 0.017022602638569504}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10172570390554042, "exact_match_stderr,custom-extract": 0.009114303697059893}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.08216136195410807, "exact_match_stderr,custom-extract": 0.00747394846095445}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.10822510822510822, "exact_match_stderr,custom-extract": 0.010225646711914642}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11422845691382766, "exact_match_stderr,custom-extract": 0.014253888115016513}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09237875288683603, "exact_match_stderr,custom-extract": 0.008037130651407133}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.14285714285714285, "exact_match_stderr,custom-extract": 0.012395054038085987}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.252, "acc_stderr,none": 0.019435727282249522, "acc_norm,none": 0.368, "acc_norm_stderr,none": 0.021588982568353548}, "piqa": {"alias": "piqa", "acc,none": 0.6779107725788901, "acc_stderr,none": 0.010902341695103427, "acc_norm,none": 0.6855277475516867, "acc_norm_stderr,none": 0.010833009065106574}, "race": {"alias": "race", "acc,none": 0.34545454545454546, "acc_stderr,none": 0.014716858425461329}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43193449334698053, "acc_stderr,none": 0.011208746100567539}, "winogrande": {"alias": "winogrande", "acc,none": 0.6045777426992897, "acc_stderr,none": 0.013741678387545348}} 
{"created_at": "2025-04-25T11:17:05.643429", "global_step": 288000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.34215017064846415, "acc_stderr,none": 0.013864152159177278, "acc_norm,none": 0.38310580204778155, "acc_norm_stderr,none": 0.01420647266167288}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.648989898989899, "acc_stderr,none": 0.009793703885101033, "acc_norm,none": 0.625, "acc_norm_stderr,none": 0.009933992677987828}, "boolq": {"alias": "boolq", "acc,none": 0.6969418960244649, "acc_stderr,none": 0.008038106885931548}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2022932022932023, "acc_stderr,none": 0.01150091452526044}, "copa": {"alias": "copa", "acc,none": 0.71, "acc_stderr,none": 0.04560480215720684}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4076877116112328, "acc_stderr,none": 0.004904002676184326, "acc_norm,none": 0.5414260107548298, "acc_norm_stderr,none": 0.004972625848702652}, "mmlu": {"acc,none": 0.25872382851445663, "acc_stderr,none": 0.003691393845847513, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27438894792773644, "acc_stderr,none": 0.0065031796887945, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1984126984126984, "acc_stderr,none": 0.03567016675276864}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.03427743175816524}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.3305785123966942, "acc_stderr,none": 0.04294340845212095}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.04077494709252627}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.31901840490797545, "acc_stderr,none": 0.03661997551073836}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2774566473988439, "acc_stderr,none": 0.024105712607754307}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.31189710610932475, "acc_stderr,none": 0.026311858071854155}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2808641975308642, "acc_stderr,none": 0.025006469755799208}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2835723598435463, "acc_stderr,none": 0.011511900775968309}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.03508771929824563}, "mmlu_other": {"acc,none": 0.24589636305117477, "acc_stderr,none": 0.007713766467375208, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23773584905660378, "acc_stderr,none": 0.02619980880756192}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.0332055644308557}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 
0.2242152466367713, "acc_stderr,none": 0.027991534258519527}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.03916667762822585}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2564102564102564, "acc_stderr,none": 0.02860595370200426}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036624}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2669220945083014, "acc_stderr,none": 0.015818450894777562}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2581699346405229, "acc_stderr,none": 0.02505850331695815}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2978723404255319, "acc_stderr,none": 0.027281608344469414}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.17279411764705882, "acc_stderr,none": 0.022966067585581774}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.19879518072289157, "acc_stderr,none": 0.031069390260789406}, "mmlu_social_sciences": {"acc,none": 0.2430939226519337, "acc_stderr,none": 0.00773196708219113, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.041424397194893624}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25252525252525254, "acc_stderr,none": 0.030954055470365907}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22797927461139897, "acc_stderr,none": 0.03027690994517826}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2153846153846154, "acc_stderr,none": 0.020843034557462878}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23119266055045873, "acc_stderr,none": 0.01807575024163316}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.21374045801526717, "acc_stderr,none": 0.0359546161177469}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2875816993464052, "acc_stderr,none": 0.018311653053648222}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2, "acc_stderr,none": 0.03831305140884603}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2571428571428571, "acc_stderr,none": 0.02797982353874455}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401464}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_stem": {"acc,none": 0.26324135743736127, "acc_stderr,none": 0.007834249164136737, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.34074074074074073, "acc_stderr,none": 0.04094376269996794}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3026315789473684, "acc_stderr,none": 0.0373852067611967}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, 
"mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.18, "acc_stderr,none": 0.038612291966536955}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617747}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720683}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.028809989854102973}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3103448275862069, "acc_stderr,none": 0.03855289616378948}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2804232804232804, "acc_stderr,none": 0.02313528797432563}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25161290322580643, "acc_stderr,none": 0.024685979286239956}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.030108330718011625}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.02646611753895991}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2781456953642384, "acc_stderr,none": 0.03658603262763743}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2175925925925926, "acc_stderr,none": 0.028139689444859672}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755806}, "mmlu_pro": {"exact_match,custom-extract": 0.11461103723404255, "exact_match_stderr,custom-extract": 0.0028915801044455095, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.15202231520223153, "exact_match_stderr,custom-extract": 0.013418048947063364}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11406844106463879, "exact_match_stderr,custom-extract": 0.011324518110214692}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.053003533568904596, "exact_match_stderr,custom-extract": 0.006661856730672938}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.14146341463414633, "exact_match_stderr,custom-extract": 0.01723216394465977}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.15165876777251186, "exact_match_stderr,custom-extract": 0.012353933579536028}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10526315789473684, "exact_match_stderr,custom-extract": 0.009863889056501643}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.13691931540342298, "exact_match_stderr,custom-extract": 0.01202671528838038}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11023622047244094, "exact_match_stderr,custom-extract": 0.016065998434778208}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1044504995458674, "exact_match_stderr,custom-extract": 0.00922154555624461}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 
0.08216136195410807, "exact_match_stderr,custom-extract": 0.007473948460954452}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12445887445887446, "exact_match_stderr,custom-extract": 0.010865516089885905}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.16432865731462926, "exact_match_stderr,custom-extract": 0.01660579746466121}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09930715935334873, "exact_match_stderr,custom-extract": 0.008301207861994751}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.15789473684210525, "exact_match_stderr,custom-extract": 0.012916292072532708}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.246, "acc_stderr,none": 0.01927981905635256, "acc_norm,none": 0.352, "acc_norm_stderr,none": 0.021380042385946044}, "piqa": {"alias": "piqa", "acc,none": 0.6708378672470077, "acc_stderr,none": 0.010963750414134703, "acc_norm,none": 0.6751904243743199, "acc_norm_stderr,none": 0.010926296238294036}, "race": {"alias": "race", "acc,none": 0.3569377990430622, "acc_stderr,none": 0.014827656367408903}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4124872057318321, "acc_stderr,none": 0.011139425259170728}, "winogrande": {"alias": "winogrande", "acc,none": 0.595895816890292, "acc_stderr,none": 0.013791610664670847}} {"created_at": "2025-04-25T12:50:06.037329", "global_step": 290000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.35580204778157, "acc_stderr,none": 0.013990571137918758, "acc_norm,none": 0.378839590443686, "acc_norm_stderr,none": 0.014175915490000322}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6485690235690236, "acc_stderr,none": 0.009796395582817722, "acc_norm,none": 0.6292087542087542, "acc_norm_stderr,none": 0.00991129282205692}, "boolq": {"alias": "boolq", "acc,none": 0.6681957186544343, "acc_stderr,none": 0.008235412870849402}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2031122031122031, "acc_stderr,none": 0.01151825479363412}, "copa": {"alias": "copa", "acc,none": 0.72, "acc_stderr,none": 0.04512608598542127}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4075881298546106, "acc_stderr,none": 0.004903815885983278, "acc_norm,none": 0.5445130452101175, "acc_norm_stderr,none": 0.004969968458256172}, "mmlu": {"acc,none": 0.2629255091867255, "acc_stderr,none": 0.0037136891885582277, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2641870350690754, "acc_stderr,none": 0.006424632177655711, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1746031746031746, "acc_stderr,none": 0.03395490020856111}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.03401506715249039}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.03198001660115071}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.29957805907172996, "acc_stderr,none": 0.029818024749753102}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2809917355371901, "acc_stderr,none": 0.04103203830514512}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.043300437496507437}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3128834355828221, "acc_stderr,none": 0.036429145782924055}, "mmlu_moral_disputes": 
{"alias": " - moral_disputes", "acc,none": 0.29190751445086704, "acc_stderr,none": 0.024476994076247323}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.26366559485530544, "acc_stderr,none": 0.02502553850053234}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.30864197530864196, "acc_stderr,none": 0.025702640260603767}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2503259452411995, "acc_stderr,none": 0.011064151027165434}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.033773102522091945}, "mmlu_other": {"acc,none": 0.2700354039266173, "acc_stderr,none": 0.00796100245594172, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23018867924528302, "acc_stderr,none": 0.025907897122408173}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.03242414757483098}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2825112107623318, "acc_stderr,none": 0.03021683101150876}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.04058042015646034}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3418803418803419, "acc_stderr,none": 0.031075028526507755}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.27330779054916987, "acc_stderr,none": 0.015936681062628556}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2581699346405229, "acc_stderr,none": 0.02505850331695815}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.3120567375886525, "acc_stderr,none": 0.027640120545169924}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.025767252010855963}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2710843373493976, "acc_stderr,none": 0.034605799075530276}, "mmlu_social_sciences": {"acc,none": 0.25706857328566785, "acc_stderr,none": 0.007874252667203553, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.20175438596491227, "acc_stderr,none": 0.03775205013583638}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25252525252525254, "acc_stderr,none": 0.030954055470365904}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.23834196891191708, "acc_stderr,none": 0.030748905363909902}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2512820512820513, "acc_stderr,none": 0.02199201666237056}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24789915966386555, "acc_stderr,none": 0.028047967224176896}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22935779816513763, "acc_stderr,none": 0.018025349724618684}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 
0.2748091603053435, "acc_stderr,none": 0.03915345408847835}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3088235294117647, "acc_stderr,none": 0.018690850273595287}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.041220665028782834}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.23265306122448978, "acc_stderr,none": 0.02704925791589618}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2537313432835821, "acc_stderr,none": 0.030769444967296014}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_stem": {"acc,none": 0.2597526165556613, "acc_stderr,none": 0.007812533780891232, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.31851851851851853, "acc_stderr,none": 0.0402477840197711}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2565789473684211, "acc_stderr,none": 0.0355418036802569}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653695}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.042801058373643966}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.23829787234042554, "acc_stderr,none": 0.027851252973889774}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2827586206896552, "acc_stderr,none": 0.03752833958003336}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.02278967314577657}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.27419354838709675, "acc_stderr,none": 0.025378139970885196}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.22167487684729065, "acc_stderr,none": 0.029225575892489596}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24814814814814815, "acc_stderr,none": 0.026335739404055803}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2980132450331126, "acc_stderr,none": 0.037345356767871984}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.028353212866863448}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25892857142857145, "acc_stderr,none": 0.04157751539865629}, "mmlu_pro": {"exact_match,custom-extract": 0.11070478723404255, "exact_match_stderr,custom-extract": 0.0028466752562638564, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 
0.13528591352859135, "exact_match_stderr,custom-extract": 0.012782212846937764}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09885931558935361, "exact_match_stderr,custom-extract": 0.010632661544075493}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06802120141342756, "exact_match_stderr,custom-extract": 0.007486759168004502}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.10975609756097561, "exact_match_stderr,custom-extract": 0.01545635835875746}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1646919431279621, "exact_match_stderr,custom-extract": 0.012774553358893336}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.1001031991744066, "exact_match_stderr,custom-extract": 0.009646786210097203}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1356968215158924, "exact_match_stderr,custom-extract": 0.011981380605227184}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12598425196850394, "exact_match_stderr,custom-extract": 0.017022602638569518}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08810172570390554, "exact_match_stderr,custom-extract": 0.008546121482440766}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0695780903034789, "exact_match_stderr,custom-extract": 0.006924833446490222}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11904761904761904, "exact_match_stderr,custom-extract": 0.010659472740112147}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1523046092184369, "exact_match_stderr,custom-extract": 0.016101330436514013}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09699769053117784, "exact_match_stderr,custom-extract": 0.00821462573306637}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.17167919799498746, "exact_match_stderr,custom-extract": 0.013357616212456423}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.246, "acc_stderr,none": 0.01927981905635256, "acc_norm,none": 0.344, "acc_norm_stderr,none": 0.02126575803797874}, "piqa": {"alias": "piqa", "acc,none": 0.6871599564744287, "acc_stderr,none": 0.010817714425701095, "acc_norm,none": 0.6811751904243744, "acc_norm_stderr,none": 0.010873037534333418}, "race": {"alias": "race", "acc,none": 0.3569377990430622, "acc_stderr,none": 0.014827656367408905}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43807574206755373, "acc_stderr,none": 0.01122696506802993}, "winogrande": {"alias": "winogrande", "acc,none": 0.606156274664562, "acc_stderr,none": 0.01373211447266875}} {"created_at": "2025-04-25T14:53:18.663047", "global_step": 292000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3165529010238908, "acc_stderr,none": 0.01359243151906808, "acc_norm,none": 0.3677474402730375, "acc_norm_stderr,none": 0.014090995618168472}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.5938552188552189, "acc_stderr,none": 0.010077409815364063, "acc_norm,none": 0.563973063973064, "acc_norm_stderr,none": 0.010175459582759734}, "boolq": {"alias": "boolq", "acc,none": 0.573394495412844, "acc_stderr,none": 0.008650327037726275}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2031122031122031, "acc_stderr,none": 0.011518254793634101}, "copa": {"alias": "copa", "acc,none": 0.71, "acc_stderr,none": 0.04560480215720684}, 
"hellaswag": {"alias": "hellaswag", "acc,none": 0.41127265484963155, "acc_stderr,none": 0.004910588449330015, "acc_norm,none": 0.5419239195379406, "acc_norm_stderr,none": 0.004972210244020563}, "mmlu": {"acc,none": 0.2667711152257513, "acc_stderr,none": 0.003725313729516391, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25292242295430395, "acc_stderr,none": 0.006337722275005978, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.29365079365079366, "acc_stderr,none": 0.040735243221471276}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23030303030303031, "acc_stderr,none": 0.03287666758603489}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.02998373305591362}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2320675105485232, "acc_stderr,none": 0.02747974455080851}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.19008264462809918, "acc_stderr,none": 0.03581796951709282}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.042365112580946336}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2822085889570552, "acc_stderr,none": 0.03536117886664743}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.25722543352601157, "acc_stderr,none": 0.023532925431044287}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.27262569832402234, "acc_stderr,none": 0.01489339173524962}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2733118971061093, "acc_stderr,none": 0.02531176597542612}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24382716049382716, "acc_stderr,none": 0.02389187954195961}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25097783572359844, "acc_stderr,none": 0.01107373029918723}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.1871345029239766, "acc_stderr,none": 0.029913127232368043}, "mmlu_other": {"acc,none": 0.266816865143225, "acc_stderr,none": 0.00792588854460801, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2830188679245283, "acc_stderr,none": 0.027724236492700904}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3236994219653179, "acc_stderr,none": 0.03567603799639171}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.23766816143497757, "acc_stderr,none": 0.028568079464714288}, "mmlu_management": {"alias": " - management", "acc,none": 0.33980582524271846, "acc_stderr,none": 0.04689765937278132}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2606837606837607, "acc_stderr,none": 0.028760348956523414}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2503192848020434, "acc_stderr,none": 0.015491088951494593}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.26143790849673204, "acc_stderr,none": 0.025160998214292456}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 
0.29432624113475175, "acc_stderr,none": 0.027187127011503796}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.30514705882352944, "acc_stderr,none": 0.027971541370170598}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.18072289156626506, "acc_stderr,none": 0.029955737855810138}, "mmlu_social_sciences": {"acc,none": 0.2723431914202145, "acc_stderr,none": 0.007999245930254383, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.15789473684210525, "acc_stderr,none": 0.03430265978485698}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25252525252525254, "acc_stderr,none": 0.030954055470365904}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.29533678756476683, "acc_stderr,none": 0.03292296639155141}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.28717948717948716, "acc_stderr,none": 0.02293992541853063}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2815126050420168, "acc_stderr,none": 0.029213549414372153}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.27522935779816515, "acc_stderr,none": 0.019149093743155196}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.037683359597287434}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.22875816993464052, "acc_stderr,none": 0.016992723465466233}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2818181818181818, "acc_stderr,none": 0.043091187099464585}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3836734693877551, "acc_stderr,none": 0.031130880396235943}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.3034825870646766, "acc_stderr,none": 0.03251006816458618}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_stem": {"acc,none": 0.281953694893752, "acc_stderr,none": 0.007992574926239443, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621503}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.03749850709174021}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.0378272898086547}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720683}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.042207736591714506}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.251063829787234, "acc_stderr,none": 0.028346963777162452}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 
0.2689655172413793, "acc_stderr,none": 0.036951833116502325}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.29365079365079366, "acc_stderr,none": 0.02345603738398203}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.27419354838709675, "acc_stderr,none": 0.025378139970885196}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3054187192118227, "acc_stderr,none": 0.032406615658684086}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.026067159222275794}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.32450331125827814, "acc_stderr,none": 0.038227469376587525}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.38425925925925924, "acc_stderr,none": 0.03317354514310742}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.125, "acc_stderr,none": 0.03139045014587016}, "mmlu_pro": {"exact_match,custom-extract": 0.11461103723404255, "exact_match_stderr,custom-extract": 0.002892327412918182, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.18270571827057183, "exact_match_stderr,custom-extract": 0.014441383098049937}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11406844106463879, "exact_match_stderr,custom-extract": 0.011324518110214692}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06448763250883392, "exact_match_stderr,custom-extract": 0.007303510883881888}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1, "exact_match_stderr,custom-extract": 0.014834045293024465}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.13270142180094788, "exact_match_stderr,custom-extract": 0.011684451168703907}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10629514963880289, "exact_match_stderr,custom-extract": 0.009906405536989354}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.14180929095354522, "exact_match_stderr,custom-extract": 0.012204871709898282}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11811023622047244, "exact_match_stderr,custom-extract": 0.01655614119804243}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10535876475930972, "exact_match_stderr,custom-extract": 0.009256854730301916}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07920059215396003, "exact_match_stderr,custom-extract": 0.007349873183910956}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11688311688311688, "exact_match_stderr,custom-extract": 0.010575091539720229}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13627254509018036, "exact_match_stderr,custom-extract": 0.015373681322287381}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10623556581986143, "exact_match_stderr,custom-extract": 0.008552816527360388}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.16416040100250626, "exact_match_stderr,custom-extract": 0.013120987227657865}, "openbookqa": {"alias": "openbookqa", "acc,none": 
0.232, "acc_stderr,none": 0.018896193591952045, "acc_norm,none": 0.36, "acc_norm_stderr,none": 0.021487751089720522}, "piqa": {"alias": "piqa", "acc,none": 0.6855277475516867, "acc_stderr,none": 0.010833009065106574, "acc_norm,none": 0.6860718171926007, "acc_norm_stderr,none": 0.010827928134189646}, "race": {"alias": "race", "acc,none": 0.3588516746411483, "acc_stderr,none": 0.014845215125262313}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4242579324462641, "acc_stderr,none": 0.01118350266234179}, "winogrande": {"alias": "winogrande", "acc,none": 0.611681136543015, "acc_stderr,none": 0.013697456658457232}} {"created_at": "2025-04-25T16:39:01.855562", "global_step": 294000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.33532423208191126, "acc_stderr,none": 0.013796182947785562, "acc_norm,none": 0.378839590443686, "acc_norm_stderr,none": 0.014175915490000322}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6616161616161617, "acc_stderr,none": 0.009709034670525096, "acc_norm,none": 0.6418350168350169, "acc_norm_stderr,none": 0.009838331651451848}, "boolq": {"alias": "boolq", "acc,none": 0.5752293577981651, "acc_stderr,none": 0.008645503833361108}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2022932022932023, "acc_stderr,none": 0.011500914525260439}, "copa": {"alias": "copa", "acc,none": 0.72, "acc_stderr,none": 0.04512608598542127}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4113722366062537, "acc_stderr,none": 0.004910767540867421, "acc_norm,none": 0.5396335391356304, "acc_norm_stderr,none": 0.004974080638364257}, "mmlu": {"acc,none": 0.26940606751175045, "acc_stderr,none": 0.003739451632124415, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2684378320935175, "acc_stderr,none": 0.006457760608620661, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.038522733649243156}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.03427743175816524}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3088235294117647, "acc_stderr,none": 0.03242661719827218}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.029041333510598028}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.3140495867768595, "acc_stderr,none": 0.04236964753041018}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.040191074725573483}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23410404624277456, "acc_stderr,none": 0.022797110278071134}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.01442229220480885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3279742765273312, "acc_stderr,none": 0.026664410886937606}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2839506172839506, "acc_stderr,none": 0.02508947852376513}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27509778357235987, "acc_stderr,none": 0.011405443620996937}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.03377310252209195}, "mmlu_other": 
{"acc,none": 0.2632764724814934, "acc_stderr,none": 0.007885491402250578, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.16, "acc_stderr,none": 0.036845294917747094}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2792452830188679, "acc_stderr,none": 0.027611163402399715}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2774566473988439, "acc_stderr,none": 0.034140140070440354}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2914798206278027, "acc_stderr,none": 0.030500283176545906}, "mmlu_management": {"alias": " - management", "acc,none": 0.30097087378640774, "acc_stderr,none": 0.04541609446503949}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.02974504857267404}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110196}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2771392081736909, "acc_stderr,none": 0.016005636294122428}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2679738562091503, "acc_stderr,none": 0.025360603796242557}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24113475177304963, "acc_stderr,none": 0.02551873104953777}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19117647058823528, "acc_stderr,none": 0.023886881922440366}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.19879518072289157, "acc_stderr,none": 0.031069390260789406}, "mmlu_social_sciences": {"acc,none": 0.26356841078973026, "acc_stderr,none": 0.007939994506783699, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.18421052631578946, "acc_stderr,none": 0.03646758875075566}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.26262626262626265, "acc_stderr,none": 0.03135305009533086}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.24352331606217617, "acc_stderr,none": 0.030975436386845436}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2512820512820513, "acc_stderr,none": 0.021992016662370564}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.028657491285071973}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26605504587155965, "acc_stderr,none": 0.018946022322225593}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.037683359597287434}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.28594771241830064, "acc_stderr,none": 0.018280485072954673}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.37272727272727274, "acc_stderr,none": 0.04631381319425464}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2612244897959184, "acc_stderr,none": 0.028123429335142787}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916707}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_stem": {"acc,none": 
0.2825880114176974, "acc_stderr,none": 0.008004370498510834, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.37037037037037035, "acc_stderr,none": 0.04171654161354543}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.32894736842105265, "acc_stderr,none": 0.03823428969926605}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2916666666666667, "acc_stderr,none": 0.038009680605548594}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.23829787234042554, "acc_stderr,none": 0.02785125297388978}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.03806142687309994}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2724867724867725, "acc_stderr,none": 0.022930973071633345}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2967741935483871, "acc_stderr,none": 0.025988500792411894}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3399014778325123, "acc_stderr,none": 0.033327690684107895}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24814814814814815, "acc_stderr,none": 0.0263357394040558}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.3443708609271523, "acc_stderr,none": 0.03879687024073327}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.028353212866863434}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03894641120044792}, "mmlu_pro": {"exact_match,custom-extract": 0.09948470744680851, "exact_match_stderr,custom-extract": 0.002718404531217859, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.1492329149232915, "exact_match_stderr,custom-extract": 0.013316225455158433}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09252217997465145, "exact_match_stderr,custom-extract": 0.01032233214186308}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06890459363957598, "exact_match_stderr,custom-extract": 0.007531645622174544}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.09024390243902439, "exact_match_stderr,custom-extract": 0.014168039768581504}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.11255924170616113, "exact_match_stderr,custom-extract": 0.010885452262179492}, 
"mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.07739938080495357, "exact_match_stderr,custom-extract": 0.008588907694718314}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.13814180929095354, "exact_match_stderr,custom-extract": 0.012071728192879428}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.08661417322834646, "exact_match_stderr,custom-extract": 0.014428786853508816}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08537693006357856, "exact_match_stderr,custom-extract": 0.008425486761039526}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07846039970392302, "exact_match_stderr,custom-extract": 0.007318387060822788}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11038961038961038, "exact_match_stderr,custom-extract": 0.010314856083401136}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1342685370741483, "exact_match_stderr,custom-extract": 0.015277913884522428}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07390300230946882, "exact_match_stderr,custom-extract": 0.007261426284059237}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.15162907268170425, "exact_match_stderr,custom-extract": 0.01270442364591539}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.248, "acc_stderr,none": 0.019332342821239103, "acc_norm,none": 0.376, "acc_norm_stderr,none": 0.02168382753928612}, "piqa": {"alias": "piqa", "acc,none": 0.6795429815016322, "acc_stderr,none": 0.010887766073814887, "acc_norm,none": 0.676822633297062, "acc_norm_stderr,none": 0.01091197412428213}, "race": {"alias": "race", "acc,none": 0.3473684210526316, "acc_stderr,none": 0.014735977850381398}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4278403275332651, "acc_stderr,none": 0.011195625418198208}, "winogrande": {"alias": "winogrande", "acc,none": 0.6179952644041041, "acc_stderr,none": 0.013655578215970418}} {"created_at": "2025-04-25T18:41:15.500625", "global_step": 296000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3506825938566553, "acc_stderr,none": 0.013944635930726087, "acc_norm,none": 0.3890784982935154, "acc_norm_stderr,none": 0.014247309976045607}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6813973063973064, "acc_stderr,none": 0.009560775507673362, "acc_norm,none": 0.656986531986532, "acc_norm_stderr,none": 0.009740965666489234}, "boolq": {"alias": "boolq", "acc,none": 0.5703363914373089, "acc_stderr,none": 0.00865809540849789}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21048321048321048, "acc_stderr,none": 0.011671038436522905}, "copa": {"alias": "copa", "acc,none": 0.7, "acc_stderr,none": 0.046056618647183814}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4148575980880303, "acc_stderr,none": 0.004916905095810844, "acc_norm,none": 0.5466042620991834, "acc_norm_stderr,none": 0.0049680589444721585}, "mmlu": {"acc,none": 0.2796610169491525, "acc_stderr,none": 0.0037807292567838997, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.26482465462274174, "acc_stderr,none": 0.0064316403073350105, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30952380952380953, "acc_stderr,none": 0.04134913018303316}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.03546563019624336}, 
"mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.03198001660115072}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3206751054852321, "acc_stderr,none": 0.030381931949990414}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.18181818181818182, "acc_stderr,none": 0.03520893951097652}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094633}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2331288343558282, "acc_stderr,none": 0.0332201579577674}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2774566473988439, "acc_stderr,none": 0.024105712607754307}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.26927374301675977, "acc_stderr,none": 0.014835616582882613}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2315112540192926, "acc_stderr,none": 0.023956532766639133}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24691358024691357, "acc_stderr,none": 0.023993501709042117}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2627118644067797, "acc_stderr,none": 0.011240545514995664}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.25146198830409355, "acc_stderr,none": 0.033275044238468436}, "mmlu_other": {"acc,none": 0.29127775989700677, "acc_stderr,none": 0.008133646896882955, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3018867924528302, "acc_stderr,none": 0.028254200344438655}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.27167630057803466, "acc_stderr,none": 0.03391750322321661}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.21524663677130046, "acc_stderr,none": 0.027584066602208256}, "mmlu_management": {"alias": " - management", "acc,none": 0.3883495145631068, "acc_stderr,none": 0.0482572933735639}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.23504273504273504, "acc_stderr,none": 0.027778835904935427}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939098}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2822477650063857, "acc_stderr,none": 0.016095302969878537}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.32679738562091504, "acc_stderr,none": 0.02685729466328142}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2730496453900709, "acc_stderr,none": 0.026577860943307854}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.34558823529411764, "acc_stderr,none": 0.02888819310398865}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.27710843373493976, "acc_stderr,none": 0.034843315926805875}, "mmlu_social_sciences": {"acc,none": 0.29574260643483913, "acc_stderr,none": 0.008211510013051104, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518752}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.32323232323232326, "acc_stderr,none": 
0.03332299921070644}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.31088082901554404, "acc_stderr,none": 0.03340361906276587}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2923076923076923, "acc_stderr,none": 0.02306043838085774}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31092436974789917, "acc_stderr,none": 0.030066761582977934}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3229357798165138, "acc_stderr,none": 0.020048115923415336}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.31297709923664124, "acc_stderr,none": 0.04066962905677697}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.238562091503268, "acc_stderr,none": 0.017242385828779593}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.04069306319721376}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.35918367346938773, "acc_stderr,none": 0.03071356045510849}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.3283582089552239, "acc_stderr,none": 0.033206858897443244}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.3, "acc_stderr,none": 0.04605661864718381}, "mmlu_stem": {"acc,none": 0.2746590548683793, "acc_stderr,none": 0.007933369574511876, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3111111111111111, "acc_stderr,none": 0.03999262876617722}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3026315789473684, "acc_stderr,none": 0.037385206761196686}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566016}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.37, "acc_stderr,none": 0.048523658709391}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.04655010411319616}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.21, "acc_stderr,none": 0.04093601807403326}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.20851063829787234, "acc_stderr,none": 0.026556982117838728}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.022182037202948368}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3225806451612903, "acc_stderr,none": 0.02659308451657228}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.03178529710642751}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 
0.2111111111111111, "acc_stderr,none": 0.02488211685765507}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.304635761589404, "acc_stderr,none": 0.03757949922943343}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.32407407407407407, "acc_stderr,none": 0.03191923445686186}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25, "acc_stderr,none": 0.04109974682633932}, "mmlu_pro": {"exact_match,custom-extract": 0.11735372340425532, "exact_match_stderr,custom-extract": 0.002916809154640323, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.18270571827057183, "exact_match_stderr,custom-extract": 0.014441383098049959}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11787072243346007, "exact_match_stderr,custom-extract": 0.011486983100199012}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.060954063604240286, "exact_match_stderr,custom-extract": 0.007113993242445063}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.08780487804878048, "exact_match_stderr,custom-extract": 0.013993989404782784}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.16113744075829384, "exact_match_stderr,custom-extract": 0.012662802960588726}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09700722394220847, "exact_match_stderr,custom-extract": 0.009512759072461451}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1625916870415648, "exact_match_stderr,custom-extract": 0.012909421300697388}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.08136482939632546, "exact_match_stderr,custom-extract": 0.014024845803977639}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11807447774750227, "exact_match_stderr,custom-extract": 0.00972965922285408}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07994078460399703, "exact_match_stderr,custom-extract": 0.0073811700146960025}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11147186147186147, "exact_match_stderr,custom-extract": 0.01035898893508243}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1342685370741483, "exact_match_stderr,custom-extract": 0.015277913884522447}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10854503464203233, "exact_match_stderr,custom-extract": 0.008634105256024198}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.17543859649122806, "exact_match_stderr,custom-extract": 0.013472398259823286}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.244, "acc_stderr,none": 0.019226734893614594, "acc_norm,none": 0.354, "acc_norm_stderr,none": 0.021407582047916447}, "piqa": {"alias": "piqa", "acc,none": 0.6784548422198041, "acc_stderr,none": 0.010897500107575652, "acc_norm,none": 0.6920565832426551, "acc_norm_stderr,none": 0.010770892367463682}, "race": {"alias": "race", "acc,none": 0.369377990430622, "acc_stderr,none": 0.014937221457864277}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.41914022517911975, "acc_stderr,none": 0.011165140708170328}, "winogrande": {"alias": "winogrande", "acc,none": 0.6053670086819258, "acc_stderr,none": 0.013736915172371888}} {"created_at": "2025-04-25T20:33:29.003152", "global_step": 298000, 
"arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3174061433447099, "acc_stderr,none": 0.01360223908803817, "acc_norm,none": 0.3677474402730375, "acc_norm_stderr,none": 0.014090995618168461}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6485690235690236, "acc_stderr,none": 0.009796395582817722, "acc_norm,none": 0.6203703703703703, "acc_norm_stderr,none": 0.009958037725468565}, "boolq": {"alias": "boolq", "acc,none": 0.5795107033639144, "acc_stderr,none": 0.00863377533246362}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2194922194922195, "acc_stderr,none": 0.011849997754533973}, "copa": {"alias": "copa", "acc,none": 0.72, "acc_stderr,none": 0.04512608598542127}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4146584345747859, "acc_stderr,none": 0.004916561213591277, "acc_norm,none": 0.5499900418243377, "acc_norm_stderr,none": 0.00496477980518065}, "mmlu": {"acc,none": 0.2536675687224042, "acc_stderr,none": 0.0036676679021518574, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2554729011689692, "acc_stderr,none": 0.006356640995951962, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1984126984126984, "acc_stderr,none": 0.03567016675276863}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.28484848484848485, "acc_stderr,none": 0.035243908445117836}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.03283472056108567}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.28270042194092826, "acc_stderr,none": 0.029312814153955924}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.03984979653302872}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2331288343558282, "acc_stderr,none": 0.033220157957767414}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.25722543352601157, "acc_stderr,none": 0.02353292543104429}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24804469273743016, "acc_stderr,none": 0.014444157808261452}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2958199356913183, "acc_stderr,none": 0.025922371788818788}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2654320987654321, "acc_stderr,none": 0.024569223600460845}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.23859191655801826, "acc_stderr,none": 0.010885929742002226}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.033014059469872514}, "mmlu_other": {"acc,none": 0.26746057289990344, "acc_stderr,none": 0.007929412501015674, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.25660377358490566, "acc_stderr,none": 0.026880647889051985}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.03095289021774988}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621505}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.35874439461883406, "acc_stderr,none": 
0.03219079200419995}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.04185832598928315}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.27350427350427353, "acc_stderr,none": 0.02920254015343117}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2796934865900383, "acc_stderr,none": 0.016050792148036546}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.02495418432487991}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2624113475177305, "acc_stderr,none": 0.026244920349843007}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.024562204314142314}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.29518072289156627, "acc_stderr,none": 0.0355092018568963}, "mmlu_social_sciences": {"acc,none": 0.2398440038999025, "acc_stderr,none": 0.007690407328544002, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.041424397194893596}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.23737373737373738, "acc_stderr,none": 0.030313710538198906}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.17098445595854922, "acc_stderr,none": 0.027171213683164542}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.23333333333333334, "acc_stderr,none": 0.021444547301560486}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.226890756302521, "acc_stderr,none": 0.027205371538279472}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24036697247706423, "acc_stderr,none": 0.01832060732096407}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596918}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.01795244919698787}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.33636363636363636, "acc_stderr,none": 0.04525393596302506}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19183673469387755, "acc_stderr,none": 0.025206963154225378}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916704}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_stem": {"acc,none": 0.250872185220425, "acc_stderr,none": 0.007710094719451751, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04072314811876837}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.24342105263157895, "acc_stderr,none": 0.034923496688842384}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.22916666666666666, "acc_stderr,none": 0.035146974678623884}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_computer_science": 
{"alias": " - college_computer_science", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.19607843137254902, "acc_stderr,none": 0.03950581861179964}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847415}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3148936170212766, "acc_stderr,none": 0.030363582197238174}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2206896551724138, "acc_stderr,none": 0.03455930201924812}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2566137566137566, "acc_stderr,none": 0.022494510767503154}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2709677419354839, "acc_stderr,none": 0.025284416114900156}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.28078817733990147, "acc_stderr,none": 0.0316185633535861}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.026962424325073828}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.02541642838876747}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.26785714285714285, "acc_stderr,none": 0.04203277291467763}, "mmlu_pro": {"exact_match,custom-extract": 0.1102061170212766, "exact_match_stderr,custom-extract": 0.0028436875168662665, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.14504881450488144, "exact_match_stderr,custom-extract": 0.013160465168737609}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10012674271229405, "exact_match_stderr,custom-extract": 0.010693074879962119}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.05918727915194346, "exact_match_stderr,custom-extract": 0.007016725322511607}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.08780487804878048, "exact_match_stderr,custom-extract": 0.013993989404782777}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1552132701421801, "exact_match_stderr,custom-extract": 0.012471657591396918}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10319917440660474, "exact_match_stderr,custom-extract": 0.009777963967387048}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.13691931540342298, "exact_match_stderr,custom-extract": 0.012026715288380386}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11286089238845144, "exact_match_stderr,custom-extract": 0.0162321409034614}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.0971843778383288, "exact_match_stderr,custom-extract": 0.008931027353227363}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07920059215396003, 
"exact_match_stderr,custom-extract": 0.00734987318391094}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1277056277056277, "exact_match_stderr,custom-extract": 0.010985901551102234}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13627254509018036, "exact_match_stderr,custom-extract": 0.015373681322287381}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09853733641262509, "exact_match_stderr,custom-extract": 0.0082725030329023}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.15789473684210525, "exact_match_stderr,custom-extract": 0.012916292072532719}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.244, "acc_stderr,none": 0.01922673489361459, "acc_norm,none": 0.378, "acc_norm_stderr,none": 0.021706550824518184}, "piqa": {"alias": "piqa", "acc,none": 0.6887921653971708, "acc_stderr,none": 0.010802263878045839, "acc_norm,none": 0.6844396082698585, "acc_norm_stderr,none": 0.010843119201758927}, "race": {"alias": "race", "acc,none": 0.3492822966507177, "acc_stderr,none": 0.014754834713104495}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43346980552712383, "acc_stderr,none": 0.011213465070419027}, "winogrande": {"alias": "winogrande", "acc,none": 0.6195737963693765, "acc_stderr,none": 0.013644727908656831}} {"created_at": "2025-04-25T22:26:34.587397", "global_step": 300000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3242320819112628, "acc_stderr,none": 0.01367881039951882, "acc_norm,none": 0.3720136518771331, "acc_norm_stderr,none": 0.014124597881844461}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6393097643097643, "acc_stderr,none": 0.009853512108416737, "acc_norm,none": 0.5968013468013468, "acc_norm_stderr,none": 0.010065668576794789}, "boolq": {"alias": "boolq", "acc,none": 0.5327217125382263, "acc_stderr,none": 0.008726308038444397}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20147420147420148, "acc_stderr,none": 0.011483500195202903}, "copa": {"alias": "copa", "acc,none": 0.7, "acc_stderr,none": 0.046056618647183814}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.41435968930491934, "acc_stderr,none": 0.0049160438384556835, "acc_norm,none": 0.5507866958773153, "acc_norm_stderr,none": 0.004963974504003023}, "mmlu": {"acc,none": 0.2766699900299103, "acc_stderr,none": 0.003757654087641729, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25759829968119025, "acc_stderr,none": 0.006375146887152749, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30952380952380953, "acc_stderr,none": 0.04134913018303316}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.03401506715249039}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.030778554678693254}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.22784810126582278, "acc_stderr,none": 0.02730348459906942}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.19008264462809918, "acc_stderr,none": 0.035817969517092825}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.0395783547198098}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2085889570552147, "acc_stderr,none": 0.03192193448934725}, "mmlu_moral_disputes": {"alias": " - 
moral_disputes", "acc,none": 0.25722543352601157, "acc_stderr,none": 0.02353292543104429}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.27262569832402234, "acc_stderr,none": 0.01489339173524962}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.26366559485530544, "acc_stderr,none": 0.02502553850053234}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.22530864197530864, "acc_stderr,none": 0.02324620264781975}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27053455019556716, "acc_stderr,none": 0.011345996743539255}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23976608187134502, "acc_stderr,none": 0.032744852119469564}, "mmlu_other": {"acc,none": 0.271000965561635, "acc_stderr,none": 0.007912987752076158, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2679245283018868, "acc_stderr,none": 0.027257260322494845}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.31213872832369943, "acc_stderr,none": 0.035331333893236574}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.16591928251121077, "acc_stderr,none": 0.024967553196547133}, "mmlu_management": {"alias": " - management", "acc,none": 0.27184466019417475, "acc_stderr,none": 0.044052680241409216}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.23504273504273504, "acc_stderr,none": 0.027778835904935427}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24393358876117496, "acc_stderr,none": 0.015357212665829477}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3366013071895425, "acc_stderr,none": 0.027057974624494382}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2624113475177305, "acc_stderr,none": 0.026244920349843007}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.41911764705882354, "acc_stderr,none": 0.02997280717046463}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21084337349397592, "acc_stderr,none": 0.0317555478662992}, "mmlu_social_sciences": {"acc,none": 0.3002924926876828, "acc_stderr,none": 0.00822466307689436, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.041857744240220575}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.32323232323232326, "acc_stderr,none": 0.03332299921070644}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.38341968911917096, "acc_stderr,none": 0.03508984236295342}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3435897435897436, "acc_stderr,none": 0.02407869658063548}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.33613445378151263, "acc_stderr,none": 0.030684737115135363}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3211009174311927, "acc_stderr,none": 0.020018149772733744}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 
0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.22712418300653595, "acc_stderr,none": 0.016949853279212387}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.041220665028782855}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.363265306122449, "acc_stderr,none": 0.030789051139030806}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2885572139303483, "acc_stderr,none": 0.03203841040213322}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_stem": {"acc,none": 0.287662543609261, "acc_stderr,none": 0.008020976868390386, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3355263157894737, "acc_stderr,none": 0.038424985593952694}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.39, "acc_stderr,none": 0.04902071300001975}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04690650298201942}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342343}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.036001056927277696}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.02278967314577656}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3064516129032258, "acc_stderr,none": 0.026226485652553873}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2660098522167488, "acc_stderr,none": 0.03108982600293752}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036622}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.026067159222275794}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31788079470198677, "acc_stderr,none": 0.038020397601079024}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.4351851851851852, "acc_stderr,none": 0.03381200005643525}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.22321428571428573, "acc_stderr,none": 0.039523019677025116}, "mmlu_pro": {"exact_match,custom-extract": 0.1040558510638298, "exact_match_stderr,custom-extract": 0.0027677895170289883, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 
0.1603905160390516, "exact_match_stderr,custom-extract": 0.01371423221942838}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.07604562737642585, "exact_match_stderr,custom-extract": 0.00944276708222895}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07332155477031801, "exact_match_stderr,custom-extract": 0.007750845159494197}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.0975609756097561, "exact_match_stderr,custom-extract": 0.01467186583433425}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.14454976303317535, "exact_match_stderr,custom-extract": 0.012111342342350239}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08152734778121776, "exact_match_stderr,custom-extract": 0.008795227818605748}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.12836185819070906, "exact_match_stderr,custom-extract": 0.011702403876565497}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11286089238845144, "exact_match_stderr,custom-extract": 0.01623214090346143}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08719346049046321, "exact_match_stderr,custom-extract": 0.00850618817194349}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07623982235381199, "exact_match_stderr,custom-extract": 0.007222768107982048}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11147186147186147, "exact_match_stderr,custom-extract": 0.01035898893508243}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13226452905811623, "exact_match_stderr,custom-extract": 0.015181011139551248}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07236335642802155, "exact_match_stderr,custom-extract": 0.007191358722321214}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.17919799498746866, "exact_match_stderr,custom-extract": 0.013584905342855015}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.218, "acc_stderr,none": 0.018483378223178856, "acc_norm,none": 0.35, "acc_norm_stderr,none": 0.021352091786223108}, "piqa": {"alias": "piqa", "acc,none": 0.6789989118607181, "acc_stderr,none": 0.010892641574707903, "acc_norm,none": 0.6887921653971708, "acc_norm_stderr,none": 0.010802263878045839}, "race": {"alias": "race", "acc,none": 0.36076555023923446, "acc_stderr,none": 0.014862517074604979}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4227226202661208, "acc_stderr,none": 0.011178123214465782}, "winogrande": {"alias": "winogrande", "acc,none": 0.5824782951854776, "acc_stderr,none": 0.013859978264440248}} {"created_at": "2025-04-25T23:54:38.613503", "global_step": 302000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3438566552901024, "acc_stderr,none": 0.013880644570156215, "acc_norm,none": 0.37372013651877134, "acc_norm_stderr,none": 0.014137708601759086}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.672979797979798, "acc_stderr,none": 0.009626235849372203, "acc_norm,none": 0.6553030303030303, "acc_norm_stderr,none": 0.009752321586569777}, "boolq": {"alias": "boolq", "acc,none": 0.5259938837920489, "acc_stderr,none": 0.008733229228168134}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2203112203112203, "acc_stderr,none": 0.011865854943402445}, "copa": {"alias": "copa", "acc,none": 0.7, "acc_stderr,none": 
0.046056618647183814}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.41674965146385184, "acc_stderr,none": 0.00492013073327178, "acc_norm,none": 0.5508862776339375, "acc_norm_stderr,none": 0.004963872936857944}, "mmlu": {"acc,none": 0.25672981056829514, "acc_stderr,none": 0.0036792589752479214, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2690754516471838, "acc_stderr,none": 0.006460092183851805, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23015873015873015, "acc_stderr,none": 0.037649508797906045}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.03374402644139404}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28921568627450983, "acc_stderr,none": 0.03182231867647553}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.350210970464135, "acc_stderr,none": 0.031052391937584353}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2809917355371901, "acc_stderr,none": 0.04103203830514512}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3148148148148148, "acc_stderr,none": 0.04489931073591311}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2085889570552147, "acc_stderr,none": 0.03192193448934722}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2658959537572254, "acc_stderr,none": 0.023786203255508287}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23910614525139665, "acc_stderr,none": 0.01426555419233115}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2765273311897106, "acc_stderr,none": 0.025403832978179594}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25617283950617287, "acc_stderr,none": 0.0242885336377261}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2757496740547588, "acc_stderr,none": 0.01141381360916099}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.03565079670708311}, "mmlu_other": {"acc,none": 0.2661731573865465, "acc_stderr,none": 0.007910164907460515, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.24528301886792453, "acc_stderr,none": 0.02648035717989569}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0326926380614177}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3811659192825112, "acc_stderr,none": 0.032596251184168264}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.04058042015646036}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3076923076923077, "acc_stderr,none": 0.030236389942173092}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.015671006009339586}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.024954184324879905}, "mmlu_professional_accounting": {"alias": " - 
professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.026992199173064356}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.21691176470588236, "acc_stderr,none": 0.025035845227711257}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2469879518072289, "acc_stderr,none": 0.03357351982064536}, "mmlu_social_sciences": {"acc,none": 0.24439389015274618, "acc_stderr,none": 0.007743602888481868, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.18421052631578946, "acc_stderr,none": 0.03646758875075566}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.20707070707070707, "acc_stderr,none": 0.028869778460267066}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22797927461139897, "acc_stderr,none": 0.03027690994517826}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2153846153846154, "acc_stderr,none": 0.020843034557462878}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2689075630252101, "acc_stderr,none": 0.028801392193631273}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24036697247706423, "acc_stderr,none": 0.01832060732096407}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.038073871163060866}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27124183006535946, "acc_stderr,none": 0.017986615304030316}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04265792110940589}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.21224489795918366, "acc_stderr,none": 0.026176967197866767}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.26865671641791045, "acc_stderr,none": 0.03134328358208955}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_stem": {"acc,none": 0.24104027909927053, "acc_stderr,none": 0.007595313573368799, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.28888888888888886, "acc_stderr,none": 0.0391545063041425}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03459777606810536}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.03476590104304134}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.39, "acc_stderr,none": 0.04902071300001975}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617746}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2, "acc_stderr,none": 0.0261488180184245}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", 
"acc,none": 0.27586206896551724, "acc_stderr,none": 0.03724563619774634}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.022019080012217914}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24516129032258063, "acc_stderr,none": 0.02447224384089552}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2315270935960591, "acc_stderr,none": 0.029678333141444455}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.024388430433987657}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2119205298013245, "acc_stderr,none": 0.03336767086567977}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.026491914727355143}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.35714285714285715, "acc_stderr,none": 0.04547960999764376}, "mmlu_pro": {"exact_match,custom-extract": 0.11294880319148937, "exact_match_stderr,custom-extract": 0.0028789536377259294, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.15620641562064155, "exact_match_stderr,custom-extract": 0.013567849945745556}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11280101394169835, "exact_match_stderr,custom-extract": 0.011269480888070125}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07508833922261485, "exact_match_stderr,custom-extract": 0.007836192107843053}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.11463414634146342, "exact_match_stderr,custom-extract": 0.015752762697429732}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1386255924170616, "exact_match_stderr,custom-extract": 0.011901560328343766}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09907120743034056, "exact_match_stderr,custom-extract": 0.009602432935115176}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1295843520782396, "exact_match_stderr,custom-extract": 0.01174974922385075}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12860892388451445, "exact_match_stderr,custom-extract": 0.017173163625244695}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11807447774750227, "exact_match_stderr,custom-extract": 0.00972965922285408}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07920059215396003, "exact_match_stderr,custom-extract": 0.007349873183910942}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12445887445887446, "exact_match_stderr,custom-extract": 0.010865516089885906}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.10420841683366733, "exact_match_stderr,custom-extract": 0.013691159072055334}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1031562740569669, "exact_match_stderr,custom-extract": 0.008442457140721601}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.15037593984962405, "exact_match_stderr,custom-extract": 0.012661157693234785}, "openbookqa": {"alias": 
"openbookqa", "acc,none": 0.218, "acc_stderr,none": 0.018483378223178856, "acc_norm,none": 0.348, "acc_norm_stderr,none": 0.0213237286328075}, "piqa": {"alias": "piqa", "acc,none": 0.6887921653971708, "acc_stderr,none": 0.010802263878045839, "acc_norm,none": 0.6779107725788901, "acc_norm_stderr,none": 0.010902341695103427}, "race": {"alias": "race", "acc,none": 0.3674641148325359, "acc_stderr,none": 0.014921064308504981}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4293756397134084, "acc_stderr,none": 0.011200637273721812}, "winogrande": {"alias": "winogrande", "acc,none": 0.6156274664561957, "acc_stderr,none": 0.013671567600836196}} {"created_at": "2025-04-26T02:00:58.612596", "global_step": 304000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.37457337883959047, "acc_stderr,none": 0.014144193471893433, "acc_norm,none": 0.4249146757679181, "acc_norm_stderr,none": 0.014445698968520769}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6910774410774411, "acc_stderr,none": 0.009481048387761353, "acc_norm,none": 0.6654040404040404, "acc_norm_stderr,none": 0.009682137724327902}, "boolq": {"alias": "boolq", "acc,none": 0.6553516819571865, "acc_stderr,none": 0.008312235338398068}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21294021294021295, "acc_stderr,none": 0.011720679449797584}, "copa": {"alias": "copa", "acc,none": 0.67, "acc_stderr,none": 0.04725815626252607}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.41256721768571997, "acc_stderr,none": 0.0049129004503708305, "acc_norm,none": 0.5477992431786497, "acc_norm_stderr,none": 0.004966928094797576}, "mmlu": {"acc,none": 0.2669135450790486, "acc_stderr,none": 0.0037299614397782025, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27523910733262485, "acc_stderr,none": 0.006506337902429808, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.039701582732351734}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.28484848484848485, "acc_stderr,none": 0.035243908445117836}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.03068582059661081}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2892561983471074, "acc_stderr,none": 0.041391127276354626}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3374233128834356, "acc_stderr,none": 0.03714908409935575}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.28034682080924855, "acc_stderr,none": 0.024182427496577612}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2346368715083799, "acc_stderr,none": 0.014173044098303667}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3183279742765273, "acc_stderr,none": 0.02645722506781103}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.025630824975621344}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2685788787483703, "acc_stderr,none": 0.011320056629121718}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.034462962170884265}, 
"mmlu_other": {"acc,none": 0.2803347280334728, "acc_stderr,none": 0.008053836291448766, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.27547169811320754, "acc_stderr,none": 0.027495663683724057}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.03345036916788991}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3183856502242152, "acc_stderr,none": 0.03126580522513713}, "mmlu_management": {"alias": " - management", "acc,none": 0.32038834951456313, "acc_stderr,none": 0.0462028408228004}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3162393162393162, "acc_stderr,none": 0.03046365674734024}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.28607918263090676, "acc_stderr,none": 0.016160871405127553}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.025646863097137904}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.30141843971631205, "acc_stderr,none": 0.027374128882631146}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.024562204314142314}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25903614457831325, "acc_stderr,none": 0.03410646614071857}, "mmlu_social_sciences": {"acc,none": 0.24764380890477738, "acc_stderr,none": 0.0077779962823150564, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.20175438596491227, "acc_stderr,none": 0.037752050135836386}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.02962022787479045}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22797927461139897, "acc_stderr,none": 0.03027690994517825}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2, "acc_stderr,none": 0.020280805062535722}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24789915966386555, "acc_stderr,none": 0.028047967224176892}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25688073394495414, "acc_stderr,none": 0.01873249292834248}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.036412970813137296}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.018249024411207675}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.04350271442923243}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2653061224489796, "acc_stderr,none": 0.0282638899437846}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22885572139303484, "acc_stderr,none": 0.029705284056772436}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.260069774817634, 
"acc_stderr,none": 0.007814990703773235, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.34074074074074073, "acc_stderr,none": 0.04094376269996794}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.23026315789473684, "acc_stderr,none": 0.03426059424403165}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3125, "acc_stderr,none": 0.038760854559127644}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.04220773659171452}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28936170212765955, "acc_stderr,none": 0.02964400657700962}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.03695183311650232}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24338624338624337, "acc_stderr,none": 0.022101128787415436}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24516129032258063, "acc_stderr,none": 0.024472243840895528}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2512315270935961, "acc_stderr,none": 0.030516530732694436}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23333333333333334, "acc_stderr,none": 0.025787874220959323}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.271523178807947, "acc_stderr,none": 0.03631329803969653}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.20833333333333334, "acc_stderr,none": 0.02769691071309394}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.29464285714285715, "acc_stderr,none": 0.04327040932578728}, "mmlu_pro": {"exact_match,custom-extract": 0.11793550531914894, "exact_match_stderr,custom-extract": 0.0029263593362841883, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.15760111576011157, "exact_match_stderr,custom-extract": 0.013617018397714216}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11913814955640051, "exact_match_stderr,custom-extract": 0.01154027657147075}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06978798586572438, "exact_match_stderr,custom-extract": 0.007576175072607233}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.12439024390243902, "exact_match_stderr,custom-extract": 0.0163187467101956}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.14691943127962084, "exact_match_stderr,custom-extract": 0.012193288704573781}, "mmlu_pro_engineering": {"alias": " - 
engineering", "exact_match,custom-extract": 0.09391124871001032, "exact_match_stderr,custom-extract": 0.009375760359013233}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.16992665036674817, "exact_match_stderr,custom-extract": 0.013139473257360399}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.06299212598425197, "exact_match_stderr,custom-extract": 0.012463010328276554}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11807447774750227, "exact_match_stderr,custom-extract": 0.00972965922285408}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.08216136195410807, "exact_match_stderr,custom-extract": 0.007473948460954438}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.14935064935064934, "exact_match_stderr,custom-extract": 0.011732160468660274}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13627254509018036, "exact_match_stderr,custom-extract": 0.015373681322287381}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09776751347190146, "exact_match_stderr,custom-extract": 0.008243642868992002}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.16290726817042606, "exact_match_stderr,custom-extract": 0.013080605724028934}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.226, "acc_stderr,none": 0.018722956449139915, "acc_norm,none": 0.362, "acc_norm_stderr,none": 0.0215136625275824}, "piqa": {"alias": "piqa", "acc,none": 0.6969532100108814, "acc_stderr,none": 0.010722648689531504, "acc_norm,none": 0.6947769314472253, "acc_norm_stderr,none": 0.01074426704560648}, "race": {"alias": "race", "acc,none": 0.3569377990430622, "acc_stderr,none": 0.014827656367408909}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4263050153531218, "acc_stderr,none": 0.01119050346326474}, "winogrande": {"alias": "winogrande", "acc,none": 0.6006314127861089, "acc_stderr,none": 0.013764933546717614}} {"created_at": "2025-04-26T03:55:08.544649", "global_step": 306000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.30802047781569963, "acc_stderr,none": 0.013491429517292038, "acc_norm,none": 0.3583617747440273, "acc_norm_stderr,none": 0.014012883334859864}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6237373737373737, "acc_stderr,none": 0.009940646221513789, "acc_norm,none": 0.5833333333333334, "acc_norm_stderr,none": 0.010116282977781246}, "boolq": {"alias": "boolq", "acc,none": 0.6519877675840978, "acc_stderr,none": 0.008331237559535395}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.22113022113022113, "acc_stderr,none": 0.011881644696037884}, "copa": {"alias": "copa", "acc,none": 0.75, "acc_stderr,none": 0.04351941398892446}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4129655447122087, "acc_stderr,none": 0.004913604782665852, "acc_norm,none": 0.5498904600677156, "acc_norm_stderr,none": 0.004964879563513316}, "mmlu": {"acc,none": 0.259792052414186, "acc_stderr,none": 0.0036974264181629797, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2731137088204038, "acc_stderr,none": 0.006486643175994122, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.15873015873015872, "acc_stderr,none": 0.03268454013011743}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.28484848484848485, "acc_stderr,none": 0.035243908445117836}, "mmlu_high_school_us_history": {"alias": " - 
high_school_us_history", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.031321798030832904}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2489451476793249, "acc_stderr,none": 0.028146970599422644}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.38016528925619836, "acc_stderr,none": 0.04431324501968432}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.0395783547198098}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3006134969325153, "acc_stderr,none": 0.03602511318806771}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.28901734104046245, "acc_stderr,none": 0.024405173935783234}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.28938906752411575, "acc_stderr,none": 0.025755865922632935}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.025630824975621344}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.28292046936114734, "acc_stderr,none": 0.011503891323188976}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23976608187134502, "acc_stderr,none": 0.03274485211946957}, "mmlu_other": {"acc,none": 0.2507241712262633, "acc_stderr,none": 0.007768353103524334, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2188679245283019, "acc_stderr,none": 0.025447863825108604}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0326926380614177}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.242152466367713, "acc_stderr,none": 0.028751392398694755}, "mmlu_management": {"alias": " - management", "acc,none": 0.23300970873786409, "acc_stderr,none": 0.041858325989283136}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2606837606837607, "acc_stderr,none": 0.028760348956523414}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2784163473818646, "acc_stderr,none": 0.016028295188992462}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.0248480182638752}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2730496453900709, "acc_stderr,none": 0.026577860943307854}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.16544117647058823, "acc_stderr,none": 0.022571771025494767}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25301204819277107, "acc_stderr,none": 0.03384429155233134}, "mmlu_social_sciences": {"acc,none": 0.24764380890477738, "acc_stderr,none": 0.007782214104819188, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748142}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.26262626262626265, "acc_stderr,none": 0.031353050095330855}, 
"mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2694300518134715, "acc_stderr,none": 0.03201867122877794}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2230769230769231, "acc_stderr,none": 0.02110773012724399}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.226890756302521, "acc_stderr,none": 0.027205371538279476}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23302752293577983, "acc_stderr,none": 0.018125669180861486}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.037276735755969195}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2875816993464052, "acc_stderr,none": 0.018311653053648222}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.20909090909090908, "acc_stderr,none": 0.038950910157241364}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2612244897959184, "acc_stderr,none": 0.028123429335142797}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_stem": {"acc,none": 0.26070409134157946, "acc_stderr,none": 0.007821445589521612, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04072314811876837}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.03690677986137283}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653696}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2425531914893617, "acc_stderr,none": 0.028020226271200217}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.03695183311650232}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776564}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.23548387096774193, "acc_stderr,none": 0.02413763242933771}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.031947400722655395}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, 
"acc_stderr,none": 0.02671924078371216}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.271523178807947, "acc_stderr,none": 0.03631329803969653}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.027920963147993666}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.24107142857142858, "acc_stderr,none": 0.04059867246952687}, "mmlu_pro": {"exact_match,custom-extract": 0.12267287234042554, "exact_match_stderr,custom-extract": 0.002983864449439511, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.17154811715481172, "exact_match_stderr,custom-extract": 0.014088673719425009}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.12801013941698353, "exact_match_stderr,custom-extract": 0.011901858811838483}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0706713780918728, "exact_match_stderr,custom-extract": 0.007620353777747214}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.10975609756097561, "exact_match_stderr,custom-extract": 0.015456358358757447}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.14218009478672985, "exact_match_stderr,custom-extract": 0.012028283958485713}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.11351909184726522, "exact_match_stderr,custom-extract": 0.010196038549390659}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.15036674816625917, "exact_match_stderr,custom-extract": 0.012504911603573}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12598425196850394, "exact_match_stderr,custom-extract": 0.017022602638569532}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1080835603996367, "exact_match_stderr,custom-extract": 0.009361502616906136}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.11028867505551443, "exact_match_stderr,custom-extract": 0.008525564311072577}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12987012987012986, "exact_match_stderr,custom-extract": 0.011064857512116033}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1402805611222445, "exact_match_stderr,custom-extract": 0.015561893867712506}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.11162432640492687, "exact_match_stderr,custom-extract": 0.008740583141383705}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.15413533834586465, "exact_match_stderr,custom-extract": 0.012790054353381214}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.24, "acc_stderr,none": 0.019118866653759753, "acc_norm,none": 0.352, "acc_norm_stderr,none": 0.021380042385946048}, "piqa": {"alias": "piqa", "acc,none": 0.6887921653971708, "acc_stderr,none": 0.010802263878045839, "acc_norm,none": 0.6958650707290533, "acc_norm_stderr,none": 0.010733493335721314}, "race": {"alias": "race", "acc,none": 0.3521531100478469, "acc_stderr,none": 0.014782629897202254}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4165813715455476, "acc_stderr,none": 0.011155497599417943}, "winogrande": {"alias": "winogrande", "acc,none": 0.6211523283346487, "acc_stderr,none": 0.01363372460318033}} {"created_at": "2025-04-26T05:20:25.137083", "global_step": 308000, "arc_challenge": 
{"alias": "arc_challenge", "acc,none": 0.3199658703071672, "acc_stderr,none": 0.013631345807016196, "acc_norm,none": 0.371160409556314, "acc_norm_stderr,none": 0.014117971901142818}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6536195286195287, "acc_stderr,none": 0.009763542075695736, "acc_norm,none": 0.6233164983164983, "acc_norm_stderr,none": 0.009942848077476169}, "boolq": {"alias": "boolq", "acc,none": 0.6776758409785932, "acc_stderr,none": 0.008174288670486751}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.23013923013923013, "acc_stderr,none": 0.012050956185794132}, "copa": {"alias": "copa", "acc,none": 0.75, "acc_stderr,none": 0.04351941398892446}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4171479784903406, "acc_stderr,none": 0.00492080031323275, "acc_norm,none": 0.5523800039832703, "acc_norm_stderr,none": 0.004962325297840991}, "mmlu": {"acc,none": 0.24989317761002705, "acc_stderr,none": 0.0036484614235918244, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24760892667375134, "acc_stderr,none": 0.0062966840789765366, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1984126984126984, "acc_stderr,none": 0.03567016675276862}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.23030303030303031, "acc_stderr,none": 0.03287666758603488}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.23039215686274508, "acc_stderr,none": 0.029554292605695053}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.24472573839662448, "acc_stderr,none": 0.02798569938703642}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.24793388429752067, "acc_stderr,none": 0.03941897526516304}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.24539877300613497, "acc_stderr,none": 0.03380939813943354}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24566473988439305, "acc_stderr,none": 0.02317629820399201}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808848}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2733118971061093, "acc_stderr,none": 0.025311765975426115}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25617283950617287, "acc_stderr,none": 0.0242885336377261}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2529335071707953, "acc_stderr,none": 0.011102268713839989}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.19883040935672514, "acc_stderr,none": 0.03061111655743253}, "mmlu_other": {"acc,none": 0.26874798841326036, "acc_stderr,none": 0.007930780224923968, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2679245283018868, "acc_stderr,none": 0.027257260322494845}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.21965317919075145, "acc_stderr,none": 0.031568093627031744}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3811659192825112, "acc_stderr,none": 0.03259625118416828}, 
"mmlu_management": {"alias": " - management", "acc,none": 0.24271844660194175, "acc_stderr,none": 0.04245022486384495}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2564102564102564, "acc_stderr,none": 0.028605953702004243}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.28607918263090676, "acc_stderr,none": 0.016160871405127532}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22875816993464052, "acc_stderr,none": 0.024051029739912255}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25177304964539005, "acc_stderr,none": 0.0258921511567094}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20220588235294118, "acc_stderr,none": 0.024398192986654924}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3192771084337349, "acc_stderr,none": 0.0362933532994786}, "mmlu_social_sciences": {"acc,none": 0.24016899577510561, "acc_stderr,none": 0.007698602773364383, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.04266339443159394}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.029126522834586825}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21243523316062177, "acc_stderr,none": 0.02951928261681725}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.21794871794871795, "acc_stderr,none": 0.020932445774463185}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.25210084033613445, "acc_stderr,none": 0.02820554503327772}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23302752293577983, "acc_stderr,none": 0.01812566918086149}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.017740899509177795}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.34545454545454546, "acc_stderr,none": 0.04554619617541054}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.21224489795918366, "acc_stderr,none": 0.026176967197866767}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916704}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_stem": {"acc,none": 0.24421186171899778, "acc_stderr,none": 0.007629314377101087, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.22962962962962963, "acc_stderr,none": 0.03633384414073465}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19078947368421054, "acc_stderr,none": 0.031975658210325}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.03476590104304134}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_computer_science": {"alias": " - 
college_computer_science", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.19607843137254902, "acc_stderr,none": 0.03950581861179964}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.32340425531914896, "acc_stderr,none": 0.030579442773610334}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2206896551724138, "acc_stderr,none": 0.03455930201924812}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2566137566137566, "acc_stderr,none": 0.022494510767503154}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2645161290322581, "acc_stderr,none": 0.02509189237885928}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2660098522167488, "acc_stderr,none": 0.03108982600293753}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.027309140588230182}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1388888888888889, "acc_stderr,none": 0.02358544736890012}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "mmlu_pro": {"exact_match,custom-extract": 0.12857380319148937, "exact_match_stderr,custom-extract": 0.0030379478684145153, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.18131101813110181, "exact_match_stderr,custom-extract": 0.014398427367734229}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.12420785804816223, "exact_match_stderr,custom-extract": 0.011749298825998147}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06978798586572438, "exact_match_stderr,custom-extract": 0.007576175072607243}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.14634146341463414, "exact_match_stderr,custom-extract": 0.017476889350508586}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.14691943127962084, "exact_match_stderr,custom-extract": 0.012193288704573778}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.11867905056759546, "exact_match_stderr,custom-extract": 0.01039480627246693}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.17726161369193155, "exact_match_stderr,custom-extract": 0.013360638127229665}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12073490813648294, "exact_match_stderr,custom-extract": 0.01671415962068329}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10535876475930972, "exact_match_stderr,custom-extract": 0.00925685473030191}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.10288675055514433, "exact_match_stderr,custom-extract": 
0.008268685556131968}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1266233766233766, "exact_match_stderr,custom-extract": 0.010946036109831503}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11422845691382766, "exact_match_stderr,custom-extract": 0.014253888115016511}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.12702078521939955, "exact_match_stderr,custom-extract": 0.009242766935988802}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.19548872180451127, "exact_match_stderr,custom-extract": 0.01404745861054571}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.26, "acc_stderr,none": 0.019635965529725512, "acc_norm,none": 0.38, "acc_norm_stderr,none": 0.02172888143870171}, "piqa": {"alias": "piqa", "acc,none": 0.6958650707290533, "acc_stderr,none": 0.01073349333572131, "acc_norm,none": 0.6996735582154516, "acc_norm_stderr,none": 0.010695225308183136}, "race": {"alias": "race", "acc,none": 0.36555023923444974, "acc_stderr,none": 0.01490465424718231}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4242579324462641, "acc_stderr,none": 0.011183502662341789}, "winogrande": {"alias": "winogrande", "acc,none": 0.6093133385951065, "acc_stderr,none": 0.013712536036556663}} {"created_at": "2025-04-26T07:20:10.711370", "global_step": 310000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3430034129692833, "acc_stderr,none": 0.01387242322371817, "acc_norm,none": 0.3856655290102389, "acc_norm_stderr,none": 0.014224250973257175}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.625, "acc_stderr,none": 0.009933992677987828, "acc_norm,none": 0.593013468013468, "acc_norm_stderr,none": 0.010080695355466596}, "boolq": {"alias": "boolq", "acc,none": 0.6755351681957187, "acc_stderr,none": 0.008188424271775841}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21457821457821458, "acc_stderr,none": 0.011753423094216842}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816505}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4183429595698068, "acc_stderr,none": 0.004922789247319877, "acc_norm,none": 0.555964947221669, "acc_norm_stderr,none": 0.004958426152481887}, "mmlu": {"acc,none": 0.2507477567298106, "acc_stderr,none": 0.0036533264600830512, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25228480340063764, "acc_stderr,none": 0.006334726883295716, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1984126984126984, "acc_stderr,none": 0.035670166752768614}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.03374402644139406}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.03166009679399811}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.25738396624472576, "acc_stderr,none": 0.028458820991460295}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.03984979653302872}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3148148148148148, "acc_stderr,none": 0.04489931073591312}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.24539877300613497, "acc_stderr,none": 0.03380939813943354}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2398843930635838, 
"acc_stderr,none": 0.02298959254312357}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2604501607717042, "acc_stderr,none": 0.024926723224845546}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2654320987654321, "acc_stderr,none": 0.024569223600460842}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25749674054758803, "acc_stderr,none": 0.011167706014904149}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2046783625730994, "acc_stderr,none": 0.03094445977853321}, "mmlu_other": {"acc,none": 0.2706791116832958, "acc_stderr,none": 0.007956369284040357, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.27169811320754716, "acc_stderr,none": 0.027377706624670713}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.21965317919075145, "acc_stderr,none": 0.031568093627031744}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3542600896860987, "acc_stderr,none": 0.03210062154134985}, "mmlu_management": {"alias": " - management", "acc,none": 0.2524271844660194, "acc_stderr,none": 0.04301250399690878}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2863247863247863, "acc_stderr,none": 0.029614323690456648}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.28735632183908044, "acc_stderr,none": 0.0161824107306827}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23202614379084968, "acc_stderr,none": 0.024170840879341016}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.02601199293090201}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19852941176470587, "acc_stderr,none": 0.024231013370541104}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3192771084337349, "acc_stderr,none": 0.0362933532994786}, "mmlu_social_sciences": {"acc,none": 0.2365940851478713, "acc_stderr,none": 0.007650957161017274, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.04266339443159395}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.029126522834586836}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20725388601036268, "acc_stderr,none": 0.029252823291803627}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2230769230769231, "acc_stderr,none": 0.02110773012724399}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.027553614467863797}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23669724770642203, "acc_stderr,none": 0.018224078117299085}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596918}, 
"mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.26143790849673204, "acc_stderr,none": 0.01777694715752805}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.35454545454545455, "acc_stderr,none": 0.04582004841505416}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.17142857142857143, "acc_stderr,none": 0.024127463462650146}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.030147775935409214}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_stem": {"acc,none": 0.24262607040913417, "acc_stderr,none": 0.0076250486045175555, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.21710526315789475, "acc_stderr,none": 0.03355045304882925}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.22916666666666666, "acc_stderr,none": 0.035146974678623884}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.15, "acc_stderr,none": 0.03588702812826371}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.19607843137254902, "acc_stderr,none": 0.03950581861179964}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.32340425531914896, "acc_stderr,none": 0.030579442773610334}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131184}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24867724867724866, "acc_stderr,none": 0.02226181769240019}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.26129032258064516, "acc_stderr,none": 0.024993053397764822}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2660098522167488, "acc_stderr,none": 0.03108982600293753}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24814814814814815, "acc_stderr,none": 0.0263357394040558}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2185430463576159, "acc_stderr,none": 0.03374235550425694}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.026491914727355178}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.29464285714285715, "acc_stderr,none": 0.04327040932578728}, "mmlu_pro": {"exact_match,custom-extract": 0.12840757978723405, "exact_match_stderr,custom-extract": 0.0030361677995645966, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.16736401673640167, "exact_match_stderr,custom-extract": 
0.013950896661189982}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10139416983523447, "exact_match_stderr,custom-extract": 0.010752959229023348}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0892226148409894, "exact_match_stderr,custom-extract": 0.008476416539386432}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.14390243902439023, "exact_match_stderr,custom-extract": 0.017355377052481254}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.16113744075829384, "exact_match_stderr,custom-extract": 0.01266280296058872}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.1042311661506708, "exact_match_stderr,custom-extract": 0.009821076496344337}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.17237163814180928, "exact_match_stderr,custom-extract": 0.013214159929836846}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11023622047244094, "exact_match_stderr,custom-extract": 0.016065998434778184}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11716621253405994, "exact_match_stderr,custom-extract": 0.00969715474552306}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0917838638045892, "exact_match_stderr,custom-extract": 0.007857979485361455}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.14935064935064934, "exact_match_stderr,custom-extract": 0.011732160468660269}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13226452905811623, "exact_match_stderr,custom-extract": 0.015181011139551248}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.11316397228637413, "exact_match_stderr,custom-extract": 0.0087930270587401}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.20175438596491227, "exact_match_stderr,custom-extract": 0.01421512235347483}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.25, "acc_stderr,none": 0.019384310743640384, "acc_norm,none": 0.356, "acc_norm_stderr,none": 0.02143471235607264}, "piqa": {"alias": "piqa", "acc,none": 0.6887921653971708, "acc_stderr,none": 0.010802263878045839, "acc_norm,none": 0.6931447225244831, "acc_norm_stderr,none": 0.01076029507058038}, "race": {"alias": "race", "acc,none": 0.3492822966507177, "acc_stderr,none": 0.014754834713104499}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4232343909928352, "acc_stderr,none": 0.011179928646626208}, "winogrande": {"alias": "winogrande", "acc,none": 0.6195737963693765, "acc_stderr,none": 0.013644727908656831}} {"created_at": "2025-04-26T09:15:50.507365", "global_step": 312000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3293515358361775, "acc_stderr,none": 0.013734057652635473, "acc_norm,none": 0.3660409556313993, "acc_norm_stderr,none": 0.014077223108470142}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6401515151515151, "acc_stderr,none": 0.009848484848484837, "acc_norm,none": 0.6031144781144782, "acc_norm_stderr,none": 0.010039236800583202}, "boolq": {"alias": "boolq", "acc,none": 0.5617737003058104, "acc_stderr,none": 0.008678056241208772}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21457821457821458, "acc_stderr,none": 0.011753423094216843}, "copa": {"alias": "copa", "acc,none": 0.71, "acc_stderr,none": 0.045604802157206845}, "hellaswag": {"alias": "hellaswag", "acc,none": 
0.4213304122684724, "acc_stderr,none": 0.00492763180647757, "acc_norm,none": 0.558653654650468, "acc_norm_stderr,none": 0.0049553302773042715}, "mmlu": {"acc,none": 0.2513174761429996, "acc_stderr,none": 0.003655411185753154, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24739638682252924, "acc_stderr,none": 0.006291762348865621, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.15079365079365079, "acc_stderr,none": 0.03200686497287394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.034277431758165236}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.03096451792692341}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.02904133351059804}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.24793388429752067, "acc_stderr,none": 0.03941897526516304}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.04453197507374984}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.03351953879521269}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2604501607717042, "acc_stderr,none": 0.024926723224845543}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2654320987654321, "acc_stderr,none": 0.024569223600460845}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24315514993481094, "acc_stderr,none": 0.010956556654417355}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.031885780176863984}, "mmlu_other": {"acc,none": 0.27421950434502734, "acc_stderr,none": 0.007986485624069964, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2830188679245283, "acc_stderr,none": 0.0277242364927009}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.21965317919075145, "acc_stderr,none": 0.031568093627031744}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3632286995515695, "acc_stderr,none": 0.032277904428505}, "mmlu_management": {"alias": " - management", "acc,none": 0.24271844660194175, "acc_stderr,none": 0.042450224863844956}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.029480360549541187}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.016328814422102055}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22875816993464052, "acc_stderr,none": 0.024051029739912255}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25886524822695034, "acc_stderr,none": 
0.026129572527180848}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20955882352941177, "acc_stderr,none": 0.024723110407677062}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.035072954313705176}, "mmlu_social_sciences": {"acc,none": 0.23626909327266818, "acc_stderr,none": 0.007645060115177747, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.04185774424022057}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.19696969696969696, "acc_stderr,none": 0.02833560973246335}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20725388601036268, "acc_stderr,none": 0.029252823291803627}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.23076923076923078, "acc_stderr,none": 0.021362027725222717}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.027553614467863797}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23853211009174313, "acc_stderr,none": 0.01827257581023187}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.21374045801526717, "acc_stderr,none": 0.0359546161177469}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.017848089574913222}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.04607582090719976}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.17142857142857143, "acc_stderr,none": 0.024127463462650146}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.22885572139303484, "acc_stderr,none": 0.029705284056772443}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_stem": {"acc,none": 0.24928639391056137, "acc_stderr,none": 0.007699033620337667, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.03785714465066653}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.23026315789473684, "acc_stderr,none": 0.034260594244031654}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2152777777777778, "acc_stderr,none": 0.03437079344106136}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036624}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.30638297872340425, "acc_stderr,none": 0.030135906478517563}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 
0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25132275132275134, "acc_stderr,none": 0.022340482339643898}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.27419354838709675, "acc_stderr,none": 0.025378139970885196}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.03178529710642749}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.22962962962962963, "acc_stderr,none": 0.02564410863926763}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2052980132450331, "acc_stderr,none": 0.03297986648473836}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.17592592592592593, "acc_stderr,none": 0.025967420958258526}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.04432804055291519}, "mmlu_pro": {"exact_match,custom-extract": 0.1141123670212766, "exact_match_stderr,custom-extract": 0.0028819288616251863, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.1589958158995816, "exact_match_stderr,custom-extract": 0.013665811152278306}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10519645120405577, "exact_match_stderr,custom-extract": 0.010929524923270297}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06007067137809187, "exact_match_stderr,custom-extract": 0.007065575492862475}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.13658536585365855, "exact_match_stderr,custom-extract": 0.01698048669306053}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.15402843601895735, "exact_match_stderr,custom-extract": 0.01243267405840755}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09494324045407637, "exact_match_stderr,custom-extract": 0.009421764715678245}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.15892420537897312, "exact_match_stderr,custom-extract": 0.012790913539867837}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11548556430446194, "exact_match_stderr,custom-extract": 0.016395494305781095}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.08628519527702089, "exact_match_stderr,custom-extract": 0.008465977919833948}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07105847520355292, "exact_match_stderr,custom-extract": 0.0069925446173869695}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12445887445887446, "exact_match_stderr,custom-extract": 0.010865516089885898}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11823647294589178, "exact_match_stderr,custom-extract": 0.014468953704661747}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.11316397228637413, "exact_match_stderr,custom-extract": 0.008793027058740103}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.18045112781954886, "exact_match_stderr,custom-extract": 0.013621911931802993}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.236, 
"acc_stderr,none": 0.01900869962208472, "acc_norm,none": 0.362, "acc_norm_stderr,none": 0.021513662527582404}, "piqa": {"alias": "piqa", "acc,none": 0.6877040261153428, "acc_stderr,none": 0.010812581599154424, "acc_norm,none": 0.6893362350380848, "acc_norm_stderr,none": 0.01079707893372767}, "race": {"alias": "race", "acc,none": 0.35789473684210527, "acc_stderr,none": 0.014836467904073733}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4278403275332651, "acc_stderr,none": 0.01119562541819821}, "winogrande": {"alias": "winogrande", "acc,none": 0.6148382004735596, "acc_stderr,none": 0.013676821287521424}} {"created_at": "2025-04-26T11:15:33.417461", "global_step": 314000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.35580204778157, "acc_stderr,none": 0.013990571137918762, "acc_norm,none": 0.3771331058020478, "acc_norm_stderr,none": 0.014163366896192589}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6641414141414141, "acc_stderr,none": 0.009691180932083498, "acc_norm,none": 0.6262626262626263, "acc_norm_stderr,none": 0.00992726705825962}, "boolq": {"alias": "boolq", "acc,none": 0.6984709480122324, "acc_stderr,none": 0.008026593966630262}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2022932022932023, "acc_stderr,none": 0.01150091452526044}, "copa": {"alias": "copa", "acc,none": 0.72, "acc_stderr,none": 0.04512608598542127}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.420035849432384, "acc_stderr,none": 0.004925556104679425, "acc_norm,none": 0.5518820952001593, "acc_norm_stderr,none": 0.0049628462061255115}, "mmlu": {"acc,none": 0.26556046147272466, "acc_stderr,none": 0.0037220366073932327, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2777895855472901, "acc_stderr,none": 0.006519438466843596, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.03333333333333336}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.03453131801885416}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3284313725490196, "acc_stderr,none": 0.03296245110172228}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3037974683544304, "acc_stderr,none": 0.029936696387138608}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.38016528925619836, "acc_stderr,none": 0.04431324501968432}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.04077494709252627}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2883435582822086, "acc_stderr,none": 0.035590395316173425}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2947976878612717, "acc_stderr,none": 0.02454761779480383}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2958199356913183, "acc_stderr,none": 0.025922371788818777}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.30246913580246915, "acc_stderr,none": 0.025557653981868045}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27053455019556716, "acc_stderr,none": 0.011345996743539252}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.03615507630310935}, "mmlu_other": {"acc,none": 
0.25587383327969104, "acc_stderr,none": 0.007813010800245272, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2490566037735849, "acc_stderr,none": 0.026616482980501715}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.27167630057803466, "acc_stderr,none": 0.03391750322321659}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2645739910313901, "acc_stderr,none": 0.029605103217038343}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.03916667762822584}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.25213675213675213, "acc_stderr,none": 0.02844796547623102}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2848020434227331, "acc_stderr,none": 0.016139174096522567}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.025646863097137894}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2801418439716312, "acc_stderr,none": 0.026789172351140242}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.16911764705882354, "acc_stderr,none": 0.022770868010113025}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.19879518072289157, "acc_stderr,none": 0.031069390260789406}, "mmlu_social_sciences": {"acc,none": 0.26064348391290215, "acc_stderr,none": 0.007913675579809515, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281336}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2828282828282828, "acc_stderr,none": 0.03208779558786753}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.26424870466321243, "acc_stderr,none": 0.031821550509166484}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2128205128205128, "acc_stderr,none": 0.020752423722128016}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2184873949579832, "acc_stderr,none": 0.026841514322958938}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26788990825688075, "acc_stderr,none": 0.018987462257978652}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2748091603053435, "acc_stderr,none": 0.039153454088478354}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.29248366013071897, "acc_stderr,none": 0.018403415710109783}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072773}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2693877551020408, "acc_stderr,none": 0.02840125202902294}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.263681592039801, "acc_stderr,none": 0.03115715086935558}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.26165556612749763, 
"acc_stderr,none": 0.00782333565937912, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.34074074074074073, "acc_stderr,none": 0.04094376269996794}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3026315789473684, "acc_stderr,none": 0.03738520676119669}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653697}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.040925639582376556}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2170212765957447, "acc_stderr,none": 0.026947483121496228}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.037245636197746325}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776564}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24838709677419354, "acc_stderr,none": 0.024580028921481}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.03144712581678242}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.02696242432507383}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.271523178807947, "acc_stderr,none": 0.03631329803969653}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.20833333333333334, "acc_stderr,none": 0.02769691071309394}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755806}, "mmlu_pro": {"exact_match,custom-extract": 0.12491688829787234, "exact_match_stderr,custom-extract": 0.00299550436392025, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.18828451882845187, "exact_match_stderr,custom-extract": 0.014610083894141207}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09632446134347275, "exact_match_stderr,custom-extract": 0.010510211344233732}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07773851590106007, "exact_match_stderr,custom-extract": 0.007961847521542128}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.16585365853658537, "exact_match_stderr,custom-extract": 0.018391705269105992}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.17654028436018956, "exact_match_stderr,custom-extract": 0.013131952481957442}, "mmlu_pro_engineering": {"alias": " - engineering", 
"exact_match,custom-extract": 0.1042311661506708, "exact_match_stderr,custom-extract": 0.00982107649634431}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.15036674816625917, "exact_match_stderr,custom-extract": 0.012504911603573}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12860892388451445, "exact_match_stderr,custom-extract": 0.017173163625244667}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11807447774750227, "exact_match_stderr,custom-extract": 0.00972965922285408}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.08216136195410807, "exact_match_stderr,custom-extract": 0.0074739484609544354}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12445887445887446, "exact_match_stderr,custom-extract": 0.010865516089885908}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13226452905811623, "exact_match_stderr,custom-extract": 0.015181011139551248}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10161662817551963, "exact_match_stderr,custom-extract": 0.00838640624753275}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.20050125313283207, "exact_match_stderr,custom-extract": 0.014182026045901096}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.21, "acc_stderr,none": 0.018233620865305916, "acc_norm,none": 0.34, "acc_norm_stderr,none": 0.021206117013673066}, "piqa": {"alias": "piqa", "acc,none": 0.6789989118607181, "acc_stderr,none": 0.010892641574707903, "acc_norm,none": 0.6844396082698585, "acc_norm_stderr,none": 0.010843119201758926}, "race": {"alias": "race", "acc,none": 0.3550239234449761, "acc_stderr,none": 0.014809839887617086}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4211873080859775, "acc_stderr,none": 0.011172633149198374}, "winogrande": {"alias": "winogrande", "acc,none": 0.6219415943172849, "acc_stderr,none": 0.013628165460523232}} {"created_at": "2025-04-26T12:33:47.519471", "global_step": 316000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3395904436860068, "acc_stderr,none": 0.013839039762820167, "acc_norm,none": 0.3728668941979522, "acc_norm_stderr,none": 0.01413117676013117}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6624579124579124, "acc_stderr,none": 0.0097031178207903, "acc_norm,none": 0.6338383838383839, "acc_norm_stderr,none": 0.009885391390947726}, "boolq": {"alias": "boolq", "acc,none": 0.6605504587155964, "acc_stderr,none": 0.008281960446071346}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.22604422604422605, "acc_stderr,none": 0.01197498190957562}, "copa": {"alias": "copa", "acc,none": 0.73, "acc_stderr,none": 0.0446196043338474}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4171479784903406, "acc_stderr,none": 0.004920800313232749, "acc_norm,none": 0.5555666201951802, "acc_norm_stderr,none": 0.0049588722884421465}, "mmlu": {"acc,none": 0.2649195271328871, "acc_stderr,none": 0.0037193830864187104, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2731137088204038, "acc_stderr,none": 0.006489141270769907, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.18253968253968253, "acc_stderr,none": 0.034550710191021475}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2787878787878788, "acc_stderr,none": 0.03501438706296781}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 
0.30392156862745096, "acc_stderr,none": 0.03228210387037894}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.28270042194092826, "acc_stderr,none": 0.029312814153955924}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4049586776859504, "acc_stderr,none": 0.044811377559424694}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.04077494709252628}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2822085889570552, "acc_stderr,none": 0.03536117886664743}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.023445826276545543}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2536312849162011, "acc_stderr,none": 0.01455155365936992}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3022508038585209, "acc_stderr,none": 0.02608270069539966}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.29012345679012347, "acc_stderr,none": 0.025251173936495022}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26988265971316816, "acc_stderr,none": 0.011337381084250404}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.29239766081871343, "acc_stderr,none": 0.03488647713457921}, "mmlu_other": {"acc,none": 0.25587383327969104, "acc_stderr,none": 0.007802958644871344, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23773584905660378, "acc_stderr,none": 0.026199808807561925}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2774566473988439, "acc_stderr,none": 0.034140140070440354}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.39, "acc_stderr,none": 0.04902071300001975}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2242152466367713, "acc_stderr,none": 0.02799153425851952}, "mmlu_management": {"alias": " - management", "acc,none": 0.20388349514563106, "acc_stderr,none": 0.03989139859531773}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.027236013946196704}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.28735632183908044, "acc_stderr,none": 0.0161824107306827}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.27124183006535946, "acc_stderr,none": 0.025457756696667878}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2801418439716312, "acc_stderr,none": 0.026789172351140242}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.17279411764705882, "acc_stderr,none": 0.022966067585581756}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.20481927710843373, "acc_stderr,none": 0.03141784291663925}, "mmlu_social_sciences": {"acc,none": 0.25349366265843354, "acc_stderr,none": 0.007844896149885055, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.29797979797979796, "acc_stderr,none": 0.03258630383836556}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics",
"acc,none": 0.24870466321243523, "acc_stderr,none": 0.03119584087770029}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.22564102564102564, "acc_stderr,none": 0.02119363252514854}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.026653531596715498}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24403669724770644, "acc_stderr,none": 0.018415286351416402}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27124183006535946, "acc_stderr,none": 0.017986615304030333}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.041220665028782834}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2653061224489796, "acc_stderr,none": 0.028263889943784606}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2537313432835821, "acc_stderr,none": 0.030769444967296024}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_stem": {"acc,none": 0.272756105296543, "acc_stderr,none": 0.007929679811403969, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.34074074074074073, "acc_stderr,none": 0.04094376269996794}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3223684210526316, "acc_stderr,none": 0.03803510248351586}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.19607843137254902, "acc_stderr,none": 0.03950581861179962}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.24680851063829787, "acc_stderr,none": 0.0281854413012341}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24867724867724866, "acc_stderr,none": 0.022261817692400175}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25806451612903225, "acc_stderr,none": 0.024892469172462836}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.31527093596059114, "acc_stderr,none": 0.03269080871970186}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.27037037037037037, "acc_stderr,none": 0.02708037281514566},
"mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.03603038545360384}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.27314814814814814, "acc_stderr,none": 0.030388051301678116}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25, "acc_stderr,none": 0.04109974682633932}, "mmlu_pro": {"exact_match,custom-extract": 0.11843417553191489, "exact_match_stderr,custom-extract": 0.002934232810875435, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.16736401673640167, "exact_match_stderr,custom-extract": 0.013950896661189982}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10646387832699619, "exact_match_stderr,custom-extract": 0.010987378600044829}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06625441696113074, "exact_match_stderr,custom-extract": 0.0073958891964677346}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.13414634146341464, "exact_match_stderr,custom-extract": 0.0168519441272791}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.16587677725118483, "exact_match_stderr,custom-extract": 0.012811326913919266}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08978328173374613, "exact_match_stderr,custom-extract": 0.0091882428049003}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.14180929095354522, "exact_match_stderr,custom-extract": 0.012204871709898289}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10761154855643044, "exact_match_stderr,custom-extract": 0.01589697945272338}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10535876475930972, "exact_match_stderr,custom-extract": 0.009256854730301911}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.09474463360473723, "exact_match_stderr,custom-extract": 0.0079706911270507}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.13203463203463203, "exact_match_stderr,custom-extract": 0.011142798517705926}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1402805611222445, "exact_match_stderr,custom-extract": 0.01556189386771252}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10854503464203233, "exact_match_stderr,custom-extract": 0.008634105256024196}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.16290726817042606, "exact_match_stderr,custom-extract": 0.013080605724028934}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.232, "acc_stderr,none": 0.01889619359195204, "acc_norm,none": 0.348, "acc_norm_stderr,none": 0.0213237286328075}, "piqa": {"alias": "piqa", "acc,none": 0.6866158868335147, "acc_stderr,none": 0.010822829929195485, "acc_norm,none": 0.6887921653971708, "acc_norm_stderr,none": 0.01080226387804584}, "race": {"alias": "race", "acc,none": 0.3492822966507177, "acc_stderr,none": 0.014754834713104492}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.42681678607983625, "acc_stderr,none": 0.011192223024107393}, "winogrande": {"alias": "winogrande", "acc,none": 0.6250986582478295, "acc_stderr,none": 0.013605544523788008}} {"created_at": "2025-04-26T14:58:03.410548", "global_step": 318000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3515358361774744, 
"acc_stderr,none": 0.013952413699600938, "acc_norm,none": 0.39334470989761094, "acc_norm_stderr,none": 0.014275101465693028}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6561447811447811, "acc_stderr,none": 0.00974666058485245, "acc_norm,none": 0.6279461279461279, "acc_norm_stderr,none": 0.009918187193096475}, "boolq": {"alias": "boolq", "acc,none": 0.6785932721712539, "acc_stderr,none": 0.00816817053141468}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.24651924651924653, "acc_stderr,none": 0.012339045968874144}, "copa": {"alias": "copa", "acc,none": 0.7, "acc_stderr,none": 0.046056618647183814}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4222266480780721, "acc_stderr,none": 0.0049290484827604515, "acc_norm,none": 0.5599482174865564, "acc_norm_stderr,none": 0.0049537871465109365}, "mmlu": {"acc,none": 0.2652756017661302, "acc_stderr,none": 0.0037210076903391237, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2667375132837407, "acc_stderr,none": 0.006444268675061938, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.0361960452412425}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2787878787878788, "acc_stderr,none": 0.035014387062967806}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.033086111132364336}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.32489451476793246, "acc_stderr,none": 0.030486039389105303}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04065578140908705}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.04414343666854933}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2331288343558282, "acc_stderr,none": 0.0332201579577674}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.28034682080924855, "acc_stderr,none": 0.02418242749657761}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.24115755627009647, "acc_stderr,none": 0.024296594034763426}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25, "acc_stderr,none": 0.02409347123262133}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27444589308996087, "acc_stderr,none": 0.011397043163078154}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2573099415204678, "acc_stderr,none": 0.03352799844161865}, "mmlu_other": {"acc,none": 0.2790473125201159, "acc_stderr,none": 0.00802512679619812, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.28679245283018867, "acc_stderr,none": 0.027834912527544074}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.030952890217749884}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.30493273542600896, "acc_stderr,none": 0.030898610882477515}, "mmlu_management": {"alias": " - management", "acc,none": 0.27184466019417475, 
"acc_stderr,none": 0.044052680241409216}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.02974504857267406}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.3167305236270754, "acc_stderr,none": 0.0166355664277125}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.024954184324879905}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.29432624113475175, "acc_stderr,none": 0.027187127011503807}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.17647058823529413, "acc_stderr,none": 0.02315746830855935}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.27710843373493976, "acc_stderr,none": 0.034843315926805875}, "mmlu_social_sciences": {"acc,none": 0.2642183945401365, "acc_stderr,none": 0.007947153090755192, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748139}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.23737373737373738, "acc_stderr,none": 0.030313710538198913}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22279792746113988, "acc_stderr,none": 0.03003114797764154}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.23846153846153847, "acc_stderr,none": 0.02160629449464773}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24369747899159663, "acc_stderr,none": 0.027886828078380575}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26788990825688075, "acc_stderr,none": 0.018987462257978652}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.03768335959728744}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.28104575163398693, "acc_stderr,none": 0.018185218954318082}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3, "acc_stderr,none": 0.04389311454644286}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2530612244897959, "acc_stderr,none": 0.027833023871399677}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.32338308457711445, "acc_stderr,none": 0.03307615947979033}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_stem": {"acc,none": 0.2505550269584523, "acc_stderr,none": 0.007710667941817647, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.28888888888888886, "acc_stderr,none": 0.0391545063041425}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3092105263157895, "acc_stderr,none": 0.037610708698674805}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.037161774375660185}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, 
"mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.03708284662416545}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28085106382978725, "acc_stderr,none": 0.02937917046412482}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2328042328042328, "acc_stderr,none": 0.021765961672154537}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2645161290322581, "acc_stderr,none": 0.025091892378859275}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.21674876847290642, "acc_stderr,none": 0.02899033125251624}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.024556172219141286}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2185430463576159, "acc_stderr,none": 0.03374235550425694}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.029157522184605607}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04287858751340455}, "mmlu_pro": {"exact_match,custom-extract": 0.11735372340425532, "exact_match_stderr,custom-extract": 0.0029216636568581886, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.16736401673640167, "exact_match_stderr,custom-extract": 0.013950896661189982}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11280101394169835, "exact_match_stderr,custom-extract": 0.01126948088807011}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0812720848056537, "exact_match_stderr,custom-extract": 0.008125177440053668}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.08536585365853659, "exact_match_stderr,custom-extract": 0.013816694190586969}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1552132701421801, "exact_match_stderr,custom-extract": 0.01247165759139694}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10835913312693499, "exact_match_stderr,custom-extract": 0.00999056535282795}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.15647921760391198, "exact_match_stderr,custom-extract": 0.012710575019806375}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12335958005249344, "exact_match_stderr,custom-extract": 0.016869623436798514}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1008174386920981, "exact_match_stderr,custom-extract": 0.00907810967245658}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07327905255366396, "exact_match_stderr,custom-extract": 0.007092470342788471}, "mmlu_pro_other": {"alias": " - other", 
"exact_match,custom-extract": 0.11580086580086581, "exact_match_stderr,custom-extract": 0.01053246671607755}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13827655310621242, "exact_match_stderr,custom-extract": 0.015468334539576873}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10854503464203233, "exact_match_stderr,custom-extract": 0.008634105256024198}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.17293233082706766, "exact_match_stderr,custom-extract": 0.013396133254614468}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.236, "acc_stderr,none": 0.019008699622084724, "acc_norm,none": 0.376, "acc_norm_stderr,none": 0.02168382753928612}, "piqa": {"alias": "piqa", "acc,none": 0.6751904243743199, "acc_stderr,none": 0.010926296238294032, "acc_norm,none": 0.6877040261153428, "acc_norm_stderr,none": 0.010812581599154424}, "race": {"alias": "race", "acc,none": 0.36555023923444974, "acc_stderr,none": 0.01490465424718231}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.428863868986694, "acc_stderr,none": 0.011198978879290182}, "winogrande": {"alias": "winogrande", "acc,none": 0.6069455406471981, "acc_stderr,none": 0.013727276249108447}} {"created_at": "2025-04-26T16:31:01.120108", "global_step": 320000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3430034129692833, "acc_stderr,none": 0.013872423223718166, "acc_norm,none": 0.39334470989761094, "acc_norm_stderr,none": 0.014275101465693026}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6536195286195287, "acc_stderr,none": 0.009763542075695741, "acc_norm,none": 0.627104377104377, "acc_norm_stderr,none": 0.00992274319712925}, "boolq": {"alias": "boolq", "acc,none": 0.5574923547400612, "acc_stderr,none": 0.008687051315181382}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20884520884520885, "acc_stderr,none": 0.011637590576063048}, "copa": {"alias": "copa", "acc,none": 0.73, "acc_stderr,none": 0.0446196043338474}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4191396136227843, "acc_stderr,none": 0.004924098711864586, "acc_norm,none": 0.5565624377614021, "acc_norm_stderr,none": 0.00495775089715293}, "mmlu": {"acc,none": 0.2564449508617006, "acc_stderr,none": 0.003680280141375993, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2724760892667375, "acc_stderr,none": 0.006486023795972355, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.15079365079365079, "acc_stderr,none": 0.03200686497287394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2787878787878788, "acc_stderr,none": 0.03501438706296781}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3137254901960784, "acc_stderr,none": 0.032566854844603886}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.34710743801652894, "acc_stderr,none": 0.04345724570292534}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3006134969325153, "acc_stderr,none": 0.03602511318806771}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2947976878612717, "acc_stderr,none": 0.024547617794803835}, 
"mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2861736334405145, "acc_stderr,none": 0.02567025924218894}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.025171041915309684}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26988265971316816, "acc_stderr,none": 0.011337381084250402}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.29239766081871343, "acc_stderr,none": 0.034886477134579215}, "mmlu_other": {"acc,none": 0.24074670099774703, "acc_stderr,none": 0.007655035824255352, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2188679245283019, "acc_stderr,none": 0.02544786382510861}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.03242414757483099}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.20179372197309417, "acc_stderr,none": 0.02693611191280227}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.03916667762822585}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2606837606837607, "acc_stderr,none": 0.028760348956523414}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2720306513409962, "acc_stderr,none": 0.015913367447500517}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.0248480182638752}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2730496453900709, "acc_stderr,none": 0.026577860943307854}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.16544117647058823, "acc_stderr,none": 0.022571771025494767}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.20481927710843373, "acc_stderr,none": 0.031417842916639245}, "mmlu_social_sciences": {"acc,none": 0.24179395515112123, "acc_stderr,none": 0.007720744998834149, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748142}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25252525252525254, "acc_stderr,none": 0.030954055470365907}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.23316062176165803, "acc_stderr,none": 0.03051611137147601}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2128205128205128, "acc_stderr,none": 0.020752423722128013}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471878}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22385321100917432, "acc_stderr,none": 0.01787121776779022}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306086}, "mmlu_professional_psychology": {"alias": " - 
professional_psychology", "acc,none": 0.2826797385620915, "acc_stderr,none": 0.01821726955205345}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24489795918367346, "acc_stderr,none": 0.027529637440174917}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916714}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_stem": {"acc,none": 0.2622898826514431, "acc_stderr,none": 0.007828713753554647, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04072314811876837}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.29605263157894735, "acc_stderr,none": 0.03715062154998904}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653696}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.20425531914893616, "acc_stderr,none": 0.02635515841334943}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.03806142687309993}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776564}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25161290322580643, "acc_stderr,none": 0.024685979286239956}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2955665024630542, "acc_stderr,none": 0.032104944337514575}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.271523178807947, "acc_stderr,none": 0.03631329803969653}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.027920963147993666}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.24107142857142858, "acc_stderr,none": 0.04059867246952687}, "mmlu_pro": {"exact_match,custom-extract": 0.11610704787234043, "exact_match_stderr,custom-extract": 0.0029068651335604715, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.18549511854951187, "exact_match_stderr,custom-extract": 0.014526352452146698}, "mmlu_pro_business": {"alias": " - 
business", "exact_match,custom-extract": 0.09759188846641319, "exact_match_stderr,custom-extract": 0.010571710152486615}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0627208480565371, "exact_match_stderr,custom-extract": 0.007209566250015035}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.11951219512195121, "exact_match_stderr,custom-extract": 0.016040065235546762}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.15758293838862558, "exact_match_stderr,custom-extract": 0.012548863257674932}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10732714138286893, "exact_match_stderr,custom-extract": 0.009948629733788072}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1491442542787286, "exact_match_stderr,custom-extract": 0.012462931361948613}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10236220472440945, "exact_match_stderr,custom-extract": 0.015549935163883106}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09355131698455948, "exact_match_stderr,custom-extract": 0.008780115347917632}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07994078460399703, "exact_match_stderr,custom-extract": 0.0073811700146960025}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.13203463203463203, "exact_match_stderr,custom-extract": 0.011142798517705924}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13226452905811623, "exact_match_stderr,custom-extract": 0.015181011139551248}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1170130869899923, "exact_match_stderr,custom-extract": 0.008921892900609298}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.14786967418546365, "exact_match_stderr,custom-extract": 0.012573709084942269}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.272, "acc_stderr,none": 0.019920483209566072, "acc_norm,none": 0.362, "acc_norm_stderr,none": 0.0215136625275824}, "piqa": {"alias": "piqa", "acc,none": 0.6833514689880305, "acc_stderr,none": 0.010853160531978484, "acc_norm,none": 0.6844396082698585, "acc_norm_stderr,none": 0.010843119201758927}, "race": {"alias": "race", "acc,none": 0.3444976076555024, "acc_stderr,none": 0.014707199932728215}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4257932446264074, "acc_stderr,none": 0.011188771652377858}, "winogrande": {"alias": "winogrande", "acc,none": 0.6022099447513812, "acc_stderr,none": 0.013755743513749025}} {"created_at": "2025-04-26T18:41:13.506820", "global_step": 322000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.34982935153583616, "acc_stderr,none": 0.013936809212158284, "acc_norm,none": 0.40187713310580203, "acc_norm_stderr,none": 0.014327268614578276}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6561447811447811, "acc_stderr,none": 0.009746660584852454, "acc_norm,none": 0.6519360269360269, "acc_norm_stderr,none": 0.009774627600259014}, "boolq": {"alias": "boolq", "acc,none": 0.5143730886850153, "acc_stderr,none": 0.008741441023424118}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.22358722358722358, "acc_stderr,none": 0.011928612008761176}, "copa": {"alias": "copa", "acc,none": 0.71, "acc_stderr,none": 0.045604802157206845}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4202350129456284, "acc_stderr,none": 
0.004925877705771195, "acc_norm,none": 0.5540728938458475, "acc_norm_stderr,none": 0.004960516570284905}, "mmlu": {"acc,none": 0.26855148839196696, "acc_stderr,none": 0.0037346723835739812, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.28437832093517534, "acc_stderr,none": 0.006572390610170884, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.19047619047619047, "acc_stderr,none": 0.03512207412302053}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2787878787878788, "acc_stderr,none": 0.035014387062967806}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28921568627450983, "acc_stderr,none": 0.03182231867647553}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.32489451476793246, "acc_stderr,none": 0.030486039389105303}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.34710743801652894, "acc_stderr,none": 0.043457245702925335}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04330043749650743}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2883435582822086, "acc_stderr,none": 0.03559039531617342}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3179190751445087, "acc_stderr,none": 0.025070713719153176}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.25027932960893856, "acc_stderr,none": 0.014487500852850417}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3183279742765273, "acc_stderr,none": 0.026457225067811025}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.025407197798890155}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2816166883963494, "acc_stderr,none": 0.011487783272786696}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30409356725146197, "acc_stderr,none": 0.0352821125824523}, "mmlu_other": {"acc,none": 0.2574831026713872, "acc_stderr,none": 0.007826769150911296, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23018867924528302, "acc_stderr,none": 0.02590789712240817}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.03295304696818318}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.28699551569506726, "acc_stderr,none": 0.030360379710291947}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.03916667762822585}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2564102564102564, "acc_stderr,none": 0.028605953702004264}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.28991060025542786, "acc_stderr,none": 0.016225017944770968}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.26143790849673204, "acc_stderr,none": 0.025160998214292456}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.026992199173064356}, 
"mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.16544117647058823, "acc_stderr,none": 0.022571771025494767}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21084337349397592, "acc_stderr,none": 0.031755547866299194}, "mmlu_social_sciences": {"acc,none": 0.2551186220344491, "acc_stderr,none": 0.00784829497096722, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.0383515395439942}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.02962022787479048}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.23316062176165803, "acc_stderr,none": 0.030516111371476008}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2205128205128205, "acc_stderr,none": 0.02102067268082791}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.026265024608275886}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26605504587155965, "acc_stderr,none": 0.018946022322225597}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.018635594034423983}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072773}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2530612244897959, "acc_stderr,none": 0.027833023871399683}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.3034825870646766, "acc_stderr,none": 0.03251006816458618}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_stem": {"acc,none": 0.2689502061528703, "acc_stderr,none": 0.007893195555966159, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.34074074074074073, "acc_stderr,none": 0.04094376269996794}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.32894736842105265, "acc_stderr,none": 0.03823428969926605}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2297872340425532, "acc_stderr,none": 0.02750175294441242}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.03806142687309993}, 
"mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02256989707491841}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2645161290322581, "acc_stderr,none": 0.02509189237885928}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.03144712581678242}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.03603038545360384}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2175925925925926, "acc_stderr,none": 0.02813968944485967}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.33035714285714285, "acc_stderr,none": 0.04464285714285714}, "mmlu_pro": {"exact_match,custom-extract": 0.11402925531914894, "exact_match_stderr,custom-extract": 0.002887091317987103, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.16457461645746166, "exact_match_stderr,custom-extract": 0.013857304110114627}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11026615969581749, "exact_match_stderr,custom-extract": 0.011158044019782565}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06448763250883392, "exact_match_stderr,custom-extract": 0.007303510883881919}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.11463414634146342, "exact_match_stderr,custom-extract": 0.01575276269742975}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.14218009478672985, "exact_match_stderr,custom-extract": 0.012028283958485741}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08668730650154799, "exact_match_stderr,custom-extract": 0.00904377653422992}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.13447432762836187, "exact_match_stderr,custom-extract": 0.011935720476846814}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12860892388451445, "exact_match_stderr,custom-extract": 0.017173163625244695}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09809264305177112, "exact_match_stderr,custom-extract": 0.00896814952185027}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0851221317542561, "exact_match_stderr,custom-extract": 0.007595142426181168}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12229437229437229, "exact_match_stderr,custom-extract": 0.010783924213904816}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13627254509018036, "exact_match_stderr,custom-extract": 0.015373681322287381}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.11393379522709776, "exact_match_stderr,custom-extract": 0.008819054413543085}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.16541353383458646, "exact_match_stderr,custom-extract": 0.013161095126056708}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.264, "acc_stderr,none": 
0.019732885585922094, "acc_norm,none": 0.382, "acc_norm_stderr,none": 0.021750820591250834}, "piqa": {"alias": "piqa", "acc,none": 0.6811751904243744, "acc_stderr,none": 0.010873037534333418, "acc_norm,none": 0.690968443960827, "acc_norm_stderr,none": 0.010781419464406979}, "race": {"alias": "race", "acc,none": 0.36076555023923446, "acc_stderr,none": 0.014862517074604975}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4227226202661208, "acc_stderr,none": 0.011178123214465777}, "winogrande": {"alias": "winogrande", "acc,none": 0.6006314127861089, "acc_stderr,none": 0.013764933546717614}} {"created_at": "2025-04-26T20:07:32.360670", "global_step": 324000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.34044368600682595, "acc_stderr,none": 0.013847460518892976, "acc_norm,none": 0.38993174061433444, "acc_norm_stderr,none": 0.014252959848892886}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6414141414141414, "acc_stderr,none": 0.009840882301225297, "acc_norm,none": 0.6203703703703703, "acc_norm_stderr,none": 0.009958037725468568}, "boolq": {"alias": "boolq", "acc,none": 0.6941896024464832, "acc_stderr,none": 0.008058565044259006}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2153972153972154, "acc_stderr,none": 0.01176969068622697}, "copa": {"alias": "copa", "acc,none": 0.71, "acc_stderr,none": 0.04560480215720684}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4191396136227843, "acc_stderr,none": 0.004924098711864581, "acc_norm,none": 0.5581557458673571, "acc_norm_stderr,none": 0.004955914693717967}, "mmlu": {"acc,none": 0.2644922375729953, "acc_stderr,none": 0.003716367307776861, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27502656748140275, "acc_stderr,none": 0.006498272762727321, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.03718489006818115}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.03401506715249039}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.03283472056108567}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.33755274261603374, "acc_stderr,none": 0.030781549102026212}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.3140495867768595, "acc_stderr,none": 0.04236964753041019}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.04414343666854933}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.03259177392742178}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.32947976878612717, "acc_stderr,none": 0.0253052581318797}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24134078212290502, "acc_stderr,none": 0.01431099954796145}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3054662379421222, "acc_stderr,none": 0.026160584450140488}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.31790123456790126, "acc_stderr,none": 0.025910063528240886}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.258148631029987, "acc_stderr,none": 0.011176923719313402}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.034462962170884265}, "mmlu_other": {"acc,none": 0.2745413582233666, 
"acc_stderr,none": 0.007991750836280245, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22264150943396227, "acc_stderr,none": 0.025604233470899105}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.033450369167889904}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3273542600896861, "acc_stderr,none": 0.03149384670994131}, "mmlu_management": {"alias": " - management", "acc,none": 0.2815533980582524, "acc_stderr,none": 0.044532548363264673}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3162393162393162, "acc_stderr,none": 0.030463656747340265}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.29757343550446996, "acc_stderr,none": 0.016349111912909428}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.02555316999182652}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.026684564340460997}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1801470588235294, "acc_stderr,none": 0.02334516361654486}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.26506024096385544, "acc_stderr,none": 0.03436024037944966}, "mmlu_social_sciences": {"acc,none": 0.2560935976600585, "acc_stderr,none": 0.00786979645614706, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.04049339297748141}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2474747474747475, "acc_stderr,none": 0.03074630074212452}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.27979274611398963, "acc_stderr,none": 0.03239637046735703}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.24358974358974358, "acc_stderr,none": 0.02176373368417392}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24789915966386555, "acc_stderr,none": 0.028047967224176892}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23302752293577983, "acc_stderr,none": 0.018125669180861483}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.28594771241830064, "acc_stderr,none": 0.018280485072954683}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072773}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22857142857142856, "acc_stderr,none": 0.02688214492230774}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.31343283582089554, "acc_stderr,none": 0.03280188205348642}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_stem": {"acc,none": 0.2470662860767523, "acc_stderr,none": 
0.007674298318023586, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.039446241625011175}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.24342105263157895, "acc_stderr,none": 0.034923496688842384}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2013888888888889, "acc_stderr,none": 0.033536474697138406}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036625}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237654}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110196}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3148936170212766, "acc_stderr,none": 0.030363582197238174}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.20689655172413793, "acc_stderr,none": 0.03375672449560554}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.022182037202948368}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.23870967741935484, "acc_stderr,none": 0.024251071262208837}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.28078817733990147, "acc_stderr,none": 0.0316185633535861}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24814814814814815, "acc_stderr,none": 0.0263357394040558}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2251655629139073, "acc_stderr,none": 0.03410435282008936}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.19444444444444445, "acc_stderr,none": 0.02699145450203673}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25892857142857145, "acc_stderr,none": 0.04157751539865629}, "mmlu_pro": {"exact_match,custom-extract": 0.11926529255319149, "exact_match_stderr,custom-extract": 0.0029432824043646093, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.17852161785216178, "exact_match_stderr,custom-extract": 0.014311559662047547}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09759188846641319, "exact_match_stderr,custom-extract": 0.010571710152486615}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07773851590106007, "exact_match_stderr,custom-extract": 0.007961847521542132}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.12439024390243902, "exact_match_stderr,custom-extract": 0.016318746710195602}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.17061611374407584, "exact_match_stderr,custom-extract": 0.012956092265003964}, "mmlu_pro_engineering": {"alias": " - engineering", 
"exact_match,custom-extract": 0.1021671826625387, "exact_match_stderr,custom-extract": 0.00973454748410986}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1393643031784841, "exact_match_stderr,custom-extract": 0.012116422904979626}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11286089238845144, "exact_match_stderr,custom-extract": 0.01623214090346143}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.12079927338782924, "exact_match_stderr,custom-extract": 0.009826069635820892}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.08586232420429311, "exact_match_stderr,custom-extract": 0.007625006884588444}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12445887445887446, "exact_match_stderr,custom-extract": 0.010865516089885912}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11623246492985972, "exact_match_stderr,custom-extract": 0.014362104240159237}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10546574287913779, "exact_match_stderr,custom-extract": 0.00852544094256091}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.16541353383458646, "exact_match_stderr,custom-extract": 0.013161095126056708}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.242, "acc_stderr,none": 0.019173085678337174, "acc_norm,none": 0.384, "acc_norm_stderr,none": 0.021772369465547198}, "piqa": {"alias": "piqa", "acc,none": 0.6817192600652884, "acc_stderr,none": 0.010868093932082231, "acc_norm,none": 0.6920565832426551, "acc_norm_stderr,none": 0.01077089236746368}, "race": {"alias": "race", "acc,none": 0.3492822966507177, "acc_stderr,none": 0.014754834713104499}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.42221084953940635, "acc_stderr,none": 0.011176305491513957}, "winogrande": {"alias": "winogrande", "acc,none": 0.606156274664562, "acc_stderr,none": 0.013732114472668753}} {"created_at": "2025-04-26T22:01:02.756849", "global_step": 326000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.37542662116040953, "acc_stderr,none": 0.014150631435111726, "acc_norm,none": 0.4206484641638225, "acc_norm_stderr,none": 0.014426211252508406}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6797138047138047, "acc_stderr,none": 0.00957415266873942, "acc_norm,none": 0.6632996632996633, "acc_norm_stderr,none": 0.00969716659575247}, "boolq": {"alias": "boolq", "acc,none": 0.710091743119266, "acc_stderr,none": 0.007935605384295896}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.22358722358722358, "acc_stderr,none": 0.011928612008761174}, "copa": {"alias": "copa", "acc,none": 0.71, "acc_stderr,none": 0.04560480215720684}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.42182832105158335, "acc_stderr,none": 0.004928420903026552, "acc_norm,none": 0.5541724756024696, "acc_norm_stderr,none": 0.004960408362133244}, "mmlu": {"acc,none": 0.2455490670844609, "acc_stderr,none": 0.003623425796449959, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25759829968119025, "acc_stderr,none": 0.006365257645601278, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.03970158273235173}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.22424242424242424, "acc_stderr,none": 0.03256866661681102}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 
0.3235294117647059, "acc_stderr,none": 0.03283472056108567}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.31223628691983124, "acc_stderr,none": 0.030165137867847008}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2644628099173554, "acc_stderr,none": 0.040261875275912046}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.032910995786157686}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2630057803468208, "acc_stderr,none": 0.023703099525258158}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480768}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023132376234543332}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26988265971316816, "acc_stderr,none": 0.01133738108425041}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.32748538011695905, "acc_stderr,none": 0.035993357714560276}, "mmlu_other": {"acc,none": 0.25426456388799484, "acc_stderr,none": 0.007783910983375226, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22264150943396227, "acc_stderr,none": 0.025604233470899098}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.21965317919075145, "acc_stderr,none": 0.031568093627031744}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036624}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.34080717488789236, "acc_stderr,none": 0.031811497470553604}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.039166677628225836}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3247863247863248, "acc_stderr,none": 0.030679022765498835}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24393358876117496, "acc_stderr,none": 0.015357212665829475}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.024630048979824782}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25886524822695034, "acc_stderr,none": 0.026129572527180848}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1875, "acc_stderr,none": 0.023709788253811766}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.23236919077023074, "acc_stderr,none": 0.007615586254528779, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518754}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.18686868686868688, "acc_stderr,none": 0.02777253333421898}, "mmlu_high_school_government_and_politics": {"alias": " - 
high_school_government_and_politics", "acc,none": 0.21243523316062177, "acc_stderr,none": 0.02951928261681725}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.22564102564102564, "acc_stderr,none": 0.021193632525148522}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.026653531596715487}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22018348623853212, "acc_stderr,none": 0.01776597865232756}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2824427480916031, "acc_stderr,none": 0.03948406125768361}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24673202614379086, "acc_stderr,none": 0.017440820367402503}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.040693063197213754}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22857142857142856, "acc_stderr,none": 0.02688214492230774}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.25870646766169153, "acc_stderr,none": 0.030965903123573026}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_stem": {"acc,none": 0.23184268950206152, "acc_stderr,none": 0.00749048001264252, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.22962962962962963, "acc_stderr,none": 0.03633384414073464}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19078947368421054, "acc_stderr,none": 0.031975658210325}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566017}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036845}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.251063829787234, "acc_stderr,none": 0.02834696377716246}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2482758620689655, "acc_stderr,none": 0.03600105692727771}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.02193587808118476}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.19032258064516128, "acc_stderr,none": 0.022331707611823078}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1625615763546798, "acc_stderr,none": 0.025960300064605576}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.024556172219141234}, 
"mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.19444444444444445, "acc_stderr,none": 0.026991454502036733}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.29464285714285715, "acc_stderr,none": 0.043270409325787296}, "mmlu_pro": {"exact_match,custom-extract": 0.11976396276595745, "exact_match_stderr,custom-extract": 0.002938010564783704, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.1701534170153417, "exact_match_stderr,custom-extract": 0.01404309168782706}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.12420785804816223, "exact_match_stderr,custom-extract": 0.01174929882599816}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.05742049469964664, "exact_match_stderr,custom-extract": 0.006917690995369924}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.13902439024390245, "exact_match_stderr,custom-extract": 0.017107213277311913}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.17061611374407584, "exact_match_stderr,custom-extract": 0.012956092265003964}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08875128998968008, "exact_match_stderr,custom-extract": 0.009140461457488615}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.15892420537897312, "exact_match_stderr,custom-extract": 0.012790913539867835}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11023622047244094, "exact_match_stderr,custom-extract": 0.016065998434778177}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09536784741144415, "exact_match_stderr,custom-extract": 0.008856062181125252}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.07031828275351591, "exact_match_stderr,custom-extract": 0.006958800549270519}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12445887445887446, "exact_match_stderr,custom-extract": 0.010865516089885905}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1342685370741483, "exact_match_stderr,custom-extract": 0.015277913884522439}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.11855273287143957, "exact_match_stderr,custom-extract": 0.00897256486100922}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.20175438596491227, "exact_match_stderr,custom-extract": 0.014215122353474826}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.232, "acc_stderr,none": 0.01889619359195205, "acc_norm,none": 0.362, "acc_norm_stderr,none": 0.021513662527582404}, "piqa": {"alias": "piqa", "acc,none": 0.6887921653971708, "acc_stderr,none": 0.010802263878045839, "acc_norm,none": 0.6931447225244831, "acc_norm_stderr,none": 0.010760295070580383}, "race": {"alias": "race", "acc,none": 0.3521531100478469, "acc_stderr,none": 0.014782629897202255}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.458546571136131, "acc_stderr,none": 0.011275119608640895}, "winogrande": {"alias": "winogrande", "acc,none": 0.6093133385951065, "acc_stderr,none": 0.013712536036556672}} {"created_at": "2025-04-27T00:06:45.434206", "global_step": 328000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 
0.37627986348122866, "acc_stderr,none": 0.01415702255540717, "acc_norm,none": 0.4035836177474403, "acc_norm_stderr,none": 0.014337158914268436}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6839225589225589, "acc_stderr,none": 0.009540440071928296, "acc_norm,none": 0.6544612794612794, "acc_norm_stderr,none": 0.009757948730670304}, "boolq": {"alias": "boolq", "acc,none": 0.537308868501529, "acc_stderr,none": 0.008720675606388454}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.01170420281420024}, "copa": {"alias": "copa", "acc,none": 0.71, "acc_stderr,none": 0.04560480215720684}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.42093208524198367, "acc_stderr,none": 0.004926996830194227, "acc_norm,none": 0.5544712208723361, "acc_norm_stderr,none": 0.0049600825288524395}, "mmlu": {"acc,none": 0.29190998433271614, "acc_stderr,none": 0.003819992731182686, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27545164718384696, "acc_stderr,none": 0.006498716253311126, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3412698412698413, "acc_stderr,none": 0.04240799327574924}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.32727272727272727, "acc_stderr,none": 0.03663974994391242}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3431372549019608, "acc_stderr,none": 0.03332139944668086}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.29535864978902954, "acc_stderr,none": 0.029696338713422886}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.1487603305785124, "acc_stderr,none": 0.03248470083807195}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.03893542518824847}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.24539877300613497, "acc_stderr,none": 0.03380939813943354}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.22832369942196531, "acc_stderr,none": 0.022598703804321614}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.26145251396648045, "acc_stderr,none": 0.014696599650364555}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.24758842443729903, "acc_stderr,none": 0.024513879973621967}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.02492200116888634}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.29726205997392435, "acc_stderr,none": 0.011673346173086022}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.25146198830409355, "acc_stderr,none": 0.033275044238468436}, "mmlu_other": {"acc,none": 0.30286449951721917, "acc_stderr,none": 0.008202967119066207, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2981132075471698, "acc_stderr,none": 0.028152837942493875}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.31213872832369943, "acc_stderr,none": 0.035331333893236574}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.273542600896861, "acc_stderr,none": 0.029918586707798834}, "mmlu_management": {"alias": " - 
management", "acc,none": 0.3883495145631068, "acc_stderr,none": 0.04825729337356389}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3034188034188034, "acc_stderr,none": 0.03011821010694265}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.26947637292464877, "acc_stderr,none": 0.01586624307321506}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3366013071895425, "acc_stderr,none": 0.027057974624494382}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2907801418439716, "acc_stderr,none": 0.027090664368353178}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.43014705882352944, "acc_stderr,none": 0.030074971917302875}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.19879518072289157, "acc_stderr,none": 0.03106939026078943}, "mmlu_social_sciences": {"acc,none": 0.30776730581735456, "acc_stderr,none": 0.008306155299333658, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.0409698513984367}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.35353535353535354, "acc_stderr,none": 0.03406086723547153}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.3471502590673575, "acc_stderr,none": 0.03435696168361355}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3384615384615385, "acc_stderr,none": 0.023991500500313036}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3403361344537815, "acc_stderr,none": 0.030778057422931673}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3192660550458716, "acc_stderr,none": 0.01998782906975001}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.29770992366412213, "acc_stderr,none": 0.04010358942462203}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.017282760695167435}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3, "acc_stderr,none": 0.04389311454644286}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.34285714285714286, "acc_stderr,none": 0.030387262919547724}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.30845771144278605, "acc_stderr,none": 0.03265819588512697}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_stem": {"acc,none": 0.29019980970504283, "acc_stderr,none": 0.00802108225020851, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.03547854198560825}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3026315789473684, "acc_stderr,none": 0.03738520676119668}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2986111111111111, "acc_stderr,none": 0.03827052357950756}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.4, "acc_stderr,none": 0.049236596391733084}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.31, 
"acc_stderr,none": 0.04648231987117316}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621503}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.39215686274509803, "acc_stderr,none": 0.04858083574266346}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.028504856470514192}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.26455026455026454, "acc_stderr,none": 0.02271746789770862}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.33225806451612905, "acc_stderr,none": 0.026795560848122794}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.26108374384236455, "acc_stderr,none": 0.030903796952114475}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24814814814814815, "acc_stderr,none": 0.0263357394040558}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.3973509933774834, "acc_stderr,none": 0.039955240076816806}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.4212962962962963, "acc_stderr,none": 0.03367462138896078}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.1875, "acc_stderr,none": 0.0370468111477387}, "mmlu_pro": {"exact_match,custom-extract": 0.12599734042553193, "exact_match_stderr,custom-extract": 0.0030146775987073262, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.18410041841004185, "exact_match_stderr,custom-extract": 0.014484023868317836}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09759188846641319, "exact_match_stderr,custom-extract": 0.010571710152486613}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0803886925795053, "exact_match_stderr,custom-extract": 0.008084782328031906}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.16829268292682928, "exact_match_stderr,custom-extract": 0.01849933956398091}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.15758293838862558, "exact_match_stderr,custom-extract": 0.012548863257674923}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10526315789473684, "exact_match_stderr,custom-extract": 0.00986388905650164}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.16136919315403422, "exact_match_stderr,custom-extract": 0.012870182092450671}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.13123359580052493, "exact_match_stderr,custom-extract": 0.01732136945584155}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11262488646684832, "exact_match_stderr,custom-extract": 0.009531789940422662}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.10436713545521836, "exact_match_stderr,custom-extract": 0.008321085955307988}, "mmlu_pro_other": {"alias": " - 
other", "exact_match,custom-extract": 0.12229437229437229, "exact_match_stderr,custom-extract": 0.010783924213904813}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.14228456913827656, "exact_match_stderr,custom-extract": 0.0156543789197872}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1147036181678214, "exact_match_stderr,custom-extract": 0.008844953561387812}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.16541353383458646, "exact_match_stderr,custom-extract": 0.013161095126056706}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.248, "acc_stderr,none": 0.019332342821239103, "acc_norm,none": 0.37, "acc_norm_stderr,none": 0.021613289165165788}, "piqa": {"alias": "piqa", "acc,none": 0.6893362350380848, "acc_stderr,none": 0.010797078933727673, "acc_norm,none": 0.7023939064200218, "acc_norm_stderr,none": 0.010667353792388213}, "race": {"alias": "race", "acc,none": 0.3674641148325359, "acc_stderr,none": 0.014921064308504987}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43654042988741043, "acc_stderr,none": 0.01122257442084478}, "winogrande": {"alias": "winogrande", "acc,none": 0.6306235201262825, "acc_stderr,none": 0.013564470596053526}} {"created_at": "2025-04-27T01:45:30.542128", "global_step": 330000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.371160409556314, "acc_stderr,none": 0.014117971901142813, "acc_norm,none": 0.41467576791808874, "acc_norm_stderr,none": 0.014397070564409172}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6957070707070707, "acc_stderr,none": 0.009441202922359183, "acc_norm,none": 0.6687710437710438, "acc_norm_stderr,none": 0.009657641311350914}, "boolq": {"alias": "boolq", "acc,none": 0.6859327217125383, "acc_stderr,none": 0.008117917728841496}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2113022113022113, "acc_stderr,none": 0.01168765590940137}, "copa": {"alias": "copa", "acc,none": 0.7, "acc_stderr,none": 0.046056618647183814}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4287990440151364, "acc_stderr,none": 0.004938930143234463, "acc_norm,none": 0.569806811392153, "acc_norm_stderr,none": 0.0049409117792733655}, "mmlu": {"acc,none": 0.26427859279304944, "acc_stderr,none": 0.0037168192670527967, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27885228480340063, "acc_stderr,none": 0.006536569462685405, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.03619604524124249}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.28484848484848485, "acc_stderr,none": 0.035243908445117836}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.03198001660115072}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.26582278481012656, "acc_stderr,none": 0.028756799629658335}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.33884297520661155, "acc_stderr,none": 0.04320767807536669}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26993865030674846, "acc_stderr,none": 0.034878251684978906}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.30346820809248554, "acc_stderr,none": 0.024752411960917205}, 
"mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3086816720257235, "acc_stderr,none": 0.02623696588115327}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2654320987654321, "acc_stderr,none": 0.024569223600460852}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2861799217731421, "acc_stderr,none": 0.011543642878150757}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30994152046783624, "acc_stderr,none": 0.035469769593931624}, "mmlu_other": {"acc,none": 0.2574831026713872, "acc_stderr,none": 0.00782845851014528, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891363}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.03345036916788991}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.29596412556053814, "acc_stderr,none": 0.03063659134869981}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.04058042015646035}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.27350427350427353, "acc_stderr,none": 0.029202540153431177}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.280970625798212, "acc_stderr,none": 0.016073127851221235}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.025646863097137897}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2695035460992908, "acc_stderr,none": 0.02646903681859063}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.15808823529411764, "acc_stderr,none": 0.02216146260806852}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2289156626506024, "acc_stderr,none": 0.03270745277352477}, "mmlu_social_sciences": {"acc,none": 0.24861878453038674, "acc_stderr,none": 0.00778923468934077, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2474747474747475, "acc_stderr,none": 0.03074630074212451}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20207253886010362, "acc_stderr,none": 0.02897908979429673}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2153846153846154, "acc_stderr,none": 0.020843034557462878}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24770642201834864, "acc_stderr,none": 0.01850814360254782}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596919}, "mmlu_professional_psychology": {"alias": " - 
professional_psychology", "acc,none": 0.28594771241830064, "acc_stderr,none": 0.018280485072954683}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04265792110940589}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24489795918367346, "acc_stderr,none": 0.027529637440174917}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2835820895522388, "acc_stderr,none": 0.03187187537919797}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_stem": {"acc,none": 0.26450999048525214, "acc_stderr,none": 0.007846243796093093, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.35555555555555557, "acc_stderr,none": 0.04135176749720385}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.28289473684210525, "acc_stderr,none": 0.03665349695640767}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.03745554791462457}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036843}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.042801058373643966}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.23404255319148937, "acc_stderr,none": 0.02767845257821239}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21957671957671956, "acc_stderr,none": 0.021320018599770348}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25806451612903225, "acc_stderr,none": 0.02489246917246284}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2561576354679803, "acc_stderr,none": 0.030712730070982592}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.026593939101844072}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2913907284768212, "acc_stderr,none": 0.037101857261199946}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.22685185185185186, "acc_stderr,none": 0.028561650102422266}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25892857142857145, "acc_stderr,none": 0.04157751539865629}, "mmlu_pro": {"exact_match,custom-extract": 0.11619015957446809, "exact_match_stderr,custom-extract": 0.0029099818996429173, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.17294281729428174, "exact_match_stderr,custom-extract": 0.014133916538056538}, "mmlu_pro_business": {"alias": " 
- business", "exact_match,custom-extract": 0.09885931558935361, "exact_match_stderr,custom-extract": 0.010632661544075493}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06537102473498234, "exact_match_stderr,custom-extract": 0.007349892115635154}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1048780487804878, "exact_match_stderr,custom-extract": 0.015150318019731044}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1552132701421801, "exact_match_stderr,custom-extract": 0.012471657591396935}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.1042311661506708, "exact_match_stderr,custom-extract": 0.009821076496344333}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.15647921760391198, "exact_match_stderr,custom-extract": 0.012710575019806393}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12335958005249344, "exact_match_stderr,custom-extract": 0.016869623436798514}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10535876475930972, "exact_match_stderr,custom-extract": 0.009256854730301902}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.09326424870466321, "exact_match_stderr,custom-extract": 0.007914638505968586}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1331168831168831, "exact_match_stderr,custom-extract": 0.011181395055683233}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13627254509018036, "exact_match_stderr,custom-extract": 0.015373681322287381}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09314857582755966, "exact_match_stderr,custom-extract": 0.008067125867388525}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.14786967418546365, "exact_match_stderr,custom-extract": 0.012573709084942288}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.26, "acc_stderr,none": 0.019635965529725512, "acc_norm,none": 0.38, "acc_norm_stderr,none": 0.02172888143870171}, "piqa": {"alias": "piqa", "acc,none": 0.690424374319913, "acc_stderr,none": 0.010786656752183345, "acc_norm,none": 0.690424374319913, "acc_norm_stderr,none": 0.010786656752183345}, "race": {"alias": "race", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.01488799043759141}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43091095189355166, "acc_stderr,none": 0.011205539177566719}, "winogrande": {"alias": "winogrande", "acc,none": 0.6172059984214681, "acc_stderr,none": 0.013660946109442015}} {"created_at": "2025-04-27T03:45:04.332233", "global_step": 332000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.32849829351535836, "acc_stderr,none": 0.013724978465537364, "acc_norm,none": 0.3822525597269625, "acc_norm_stderr,none": 0.014200454049979282}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6363636363636364, "acc_stderr,none": 0.009870849346011762, "acc_norm,none": 0.6144781144781145, "acc_norm_stderr,none": 0.009987250004629024}, "boolq": {"alias": "boolq", "acc,none": 0.7125382262996942, "acc_stderr,none": 0.007915651663295342}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.22276822276822278, "acc_stderr,none": 0.011913022964039562}, "copa": {"alias": "copa", "acc,none": 0.72, "acc_stderr,none": 0.045126085985421276}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4307906791475802, "acc_stderr,none": 
0.004941748817682299, "acc_norm,none": 0.563931487751444, "acc_norm_stderr,none": 0.004948824501355485}, "mmlu": {"acc,none": 0.2778094288562883, "acc_stderr,none": 0.0037778984484835108, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27396386822529223, "acc_stderr,none": 0.006502052686354819, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.040061680838488774}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.03427743175816524}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.29901960784313725, "acc_stderr,none": 0.03213325717373616}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3080168776371308, "acc_stderr,none": 0.0300523893356057}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04065578140908705}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.04453197507374984}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26380368098159507, "acc_stderr,none": 0.03462419931615623}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.023618678310069367}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23687150837988827, "acc_stderr,none": 0.014219570788103986}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.31511254019292606, "acc_stderr,none": 0.026385273703464496}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2932098765432099, "acc_stderr,none": 0.025329888171900926}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2757496740547588, "acc_stderr,none": 0.011413813609160982}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.29239766081871343, "acc_stderr,none": 0.03488647713457921}, "mmlu_other": {"acc,none": 0.28355326681686516, "acc_stderr,none": 0.008076376885218362, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165044}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2792452830188679, "acc_stderr,none": 0.027611163402399715}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3179190751445087, "acc_stderr,none": 0.035506839891655796}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.37, "acc_stderr,none": 0.048523658709391}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2645739910313901, "acc_stderr,none": 0.02960510321703831}, "mmlu_management": {"alias": " - management", "acc,none": 0.30097087378640774, "acc_stderr,none": 0.045416094465039476}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.02948036054954119}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2886334610472541, "acc_stderr,none": 0.016203792703197793}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3137254901960784, "acc_stderr,none": 0.026568921015457138}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.29432624113475175, "acc_stderr,none": 0.027187127011503796}, "mmlu_professional_medicine": 
{"alias": " - professional_medicine", "acc,none": 0.20220588235294118, "acc_stderr,none": 0.024398192986654924}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2469879518072289, "acc_stderr,none": 0.03357351982064537}, "mmlu_social_sciences": {"acc,none": 0.27624309392265195, "acc_stderr,none": 0.008055434765459207, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25757575757575757, "acc_stderr,none": 0.03115626951964684}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.27979274611398963, "acc_stderr,none": 0.03239637046735702}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.258974358974359, "acc_stderr,none": 0.022211106810061658}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.22268907563025211, "acc_stderr,none": 0.027025433498882374}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.30275229357798167, "acc_stderr,none": 0.01969871143475635}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3053435114503817, "acc_stderr,none": 0.04039314978724561}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27124183006535946, "acc_stderr,none": 0.01798661530403032}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.04013964554072775}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.27346938775510204, "acc_stderr,none": 0.028535560337128448}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.35323383084577115, "acc_stderr,none": 0.03379790611796776}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_stem": {"acc,none": 0.2794164287979702, "acc_stderr,none": 0.007987792802272544, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.362962962962963, "acc_stderr,none": 0.041539484047424004}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3223684210526316, "acc_stderr,none": 0.03803510248351586}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3402777777777778, "acc_stderr,none": 0.03962135573486219}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.04533838195929776}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.028809989854102953}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3103448275862069, "acc_stderr,none": 0.03855289616378948}, "mmlu_elementary_mathematics": {"alias": " - 
elementary_mathematics", "acc,none": 0.24867724867724866, "acc_stderr,none": 0.02226181769240018}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2709677419354839, "acc_stderr,none": 0.02528441611490016}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2660098522167488, "acc_stderr,none": 0.03108982600293752}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.25165562913907286, "acc_stderr,none": 0.035433042343899844}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.027920963147993662}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25, "acc_stderr,none": 0.04109974682633932}, "mmlu_pro": {"exact_match,custom-extract": 0.11876662234042554, "exact_match_stderr,custom-extract": 0.00293640154116964, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.18270571827057183, "exact_match_stderr,custom-extract": 0.014441383098049954}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11660329531051965, "exact_match_stderr,custom-extract": 0.011433262922605282}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.05918727915194346, "exact_match_stderr,custom-extract": 0.0070167253225116065}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.10975609756097561, "exact_match_stderr,custom-extract": 0.015456358358757447}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.15995260663507108, "exact_match_stderr,custom-extract": 0.012625069219392144}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10319917440660474, "exact_match_stderr,custom-extract": 0.00977796396738705}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.14547677261613692, "exact_match_stderr,custom-extract": 0.012335243774582397}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.14698162729658792, "exact_match_stderr,custom-extract": 0.018164310621441037}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11444141689373297, "exact_match_stderr,custom-extract": 0.009598512147633246}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.09104367135455219, "exact_match_stderr,custom-extract": 0.007829418466689903}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.13528138528138528, "exact_match_stderr,custom-extract": 0.011257853023057833}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12224448897795591, "exact_match_stderr,custom-extract": 0.01467867164938674}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.09699769053117784, "exact_match_stderr,custom-extract": 0.008214625733066354}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.15413533834586465, "exact_match_stderr,custom-extract": 0.012790054353381214}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.258, "acc_stderr,none": 0.01958671178521584, "acc_norm,none": 0.384, "acc_norm_stderr,none": 
0.021772369465547198}, "piqa": {"alias": "piqa", "acc,none": 0.704570184983678, "acc_stderr,none": 0.010644731559342464, "acc_norm,none": 0.7110990206746464, "acc_norm_stderr,none": 0.010575111841364901}, "race": {"alias": "race", "acc,none": 0.36076555023923446, "acc_stderr,none": 0.014862517074604974}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43500511770726713, "acc_stderr,none": 0.011218074465506494}, "winogrande": {"alias": "winogrande", "acc,none": 0.6227308602999211, "acc_stderr,none": 0.013622567928799501}} {"created_at": "2025-04-27T05:37:26.564174", "global_step": 334000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3302047781569966, "acc_stderr,none": 0.013743085603760427, "acc_norm,none": 0.3660409556313993, "acc_norm_stderr,none": 0.014077223108470137}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6073232323232324, "acc_stderr,none": 0.01002064655553869, "acc_norm,none": 0.5686026936026936, "acc_norm_stderr,none": 0.010162752847747494}, "boolq": {"alias": "boolq", "acc,none": 0.7256880733944954, "acc_stderr,none": 0.0078035079593829935}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20966420966420968, "acc_stderr,none": 0.011654350093704639}, "copa": {"alias": "copa", "acc,none": 0.75, "acc_stderr,none": 0.04351941398892446}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4237203744274049, "acc_stderr,none": 0.004931372657129781, "acc_norm,none": 0.5656243776140211, "acc_norm_stderr,none": 0.004946617138983515}, "mmlu": {"acc,none": 0.23208944594787068, "acc_stderr,none": 0.0035569887011259317, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24335812964930925, "acc_stderr,none": 0.006254691480472895, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.040061680838488774}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03225078108306289}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25, "acc_stderr,none": 0.03039153369274154}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.27848101265822783, "acc_stderr,none": 0.029178682304842555}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094632}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.01442229220480886}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.18971061093247588, "acc_stderr,none": 0.022268196258783218}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21604938271604937, "acc_stderr,none": 0.02289916291844581}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2438070404172099, "acc_stderr,none": 0.010966507972178473}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.03565079670708311}, "mmlu_other": {"acc,none": 0.24299967814612167, "acc_stderr,none": 0.007678647971181724, "alias": " - other"}, 
"mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22264150943396227, "acc_stderr,none": 0.025604233470899098}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.03095289021774988}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3183856502242152, "acc_stderr,none": 0.03126580522513713}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2388250319284802, "acc_stderr,none": 0.015246803197398687}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23202614379084968, "acc_stderr,none": 0.02417084087934102}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24822695035460993, "acc_stderr,none": 0.025770015644290396}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1875, "acc_stderr,none": 0.023709788253811766}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.27710843373493976, "acc_stderr,none": 0.034843315926805875}, "mmlu_social_sciences": {"acc,none": 0.2183945401364966, "acc_stderr,none": 0.0074461028380714035, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281335}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.17676767676767677, "acc_stderr,none": 0.027178752639044915}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.19689119170984457, "acc_stderr,none": 0.028697873971860677}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20512820512820512, "acc_stderr,none": 0.02047323317355198}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.026653531596715484}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1926605504587156, "acc_stderr,none": 0.016909276884936094}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25, "acc_stderr,none": 0.01751781884501444}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.04069306319721375}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.19183673469387755, "acc_stderr,none": 0.025206963154225423}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.03036049015401466}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_stem": {"acc,none": 0.21788772597526165, "acc_stderr,none": 0.00733238474743656, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", 
"acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2, "acc_stderr,none": 0.03455473702325436}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19078947368421054, "acc_stderr,none": 0.031975658210325}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342347}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.20899470899470898, "acc_stderr,none": 0.02094048156533485}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1774193548387097, "acc_stderr,none": 0.02173254068932927}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.15270935960591134, "acc_stderr,none": 0.025308904539380627}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.024882116857655113}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1527777777777778, "acc_stderr,none": 0.02453632602613422}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "mmlu_pro": {"exact_match,custom-extract": 0.11893284574468085, "exact_match_stderr,custom-extract": 0.002936901996673425, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.19525801952580196, "exact_match_stderr,custom-extract": 0.014814133620953723}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.13181242078580482, "exact_match_stderr,custom-extract": 0.012050965508446783}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07243816254416961, "exact_match_stderr,custom-extract": 0.007707683029020946}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.15853658536585366, "exact_match_stderr,custom-extract": 0.01806012347189305}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.15876777251184834, "exact_match_stderr,custom-extract": 0.012587090061175905}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.08359133126934984, "exact_match_stderr,custom-extract": 0.008895851747412855}, 
"mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.15403422982885084, "exact_match_stderr,custom-extract": 0.01262914611294394}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.13910761154855644, "exact_match_stderr,custom-extract": 0.017752441192974866}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10172570390554042, "exact_match_stderr,custom-extract": 0.00911430369705992}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0851221317542561, "exact_match_stderr,custom-extract": 0.007595142426181129}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12337662337662338, "exact_match_stderr,custom-extract": 0.010824855641262726}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11422845691382766, "exact_match_stderr,custom-extract": 0.014253888115016483}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10546574287913779, "exact_match_stderr,custom-extract": 0.008525440942560929}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.13909774436090225, "exact_match_stderr,custom-extract": 0.012257666634016486}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.228, "acc_stderr,none": 0.018781306529363197, "acc_norm,none": 0.362, "acc_norm_stderr,none": 0.021513662527582404}, "piqa": {"alias": "piqa", "acc,none": 0.6958650707290533, "acc_stderr,none": 0.010733493335721307, "acc_norm,none": 0.6871599564744287, "acc_norm_stderr,none": 0.010817714425701086}, "race": {"alias": "race", "acc,none": 0.33588516746411484, "acc_stderr,none": 0.014617286312430689}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.41606960081883315, "acc_stderr,none": 0.011153531906320234}, "winogrande": {"alias": "winogrande", "acc,none": 0.6187845303867403, "acc_stderr,none": 0.013650172164160305}} {"created_at": "2025-04-27T07:25:36.324061", "global_step": 336000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3361774744027304, "acc_stderr,none": 0.013804855026205758, "acc_norm,none": 0.3967576791808874, "acc_norm_stderr,none": 0.014296513020180633}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6439393939393939, "acc_stderr,none": 0.00982545460841631, "acc_norm,none": 0.6191077441077442, "acc_norm_stderr,none": 0.009964428212260377}, "boolq": {"alias": "boolq", "acc,none": 0.708868501529052, "acc_stderr,none": 0.007945477040745955}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.011704202814200253}, "copa": {"alias": "copa", "acc,none": 0.73, "acc_stderr,none": 0.04461960433384741}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.42421828321051586, "acc_stderr,none": 0.004932137126625415, "acc_norm,none": 0.5638319059948218, "acc_norm_stderr,none": 0.004948952519517516}, "mmlu": {"acc,none": 0.2410625267055975, "acc_stderr,none": 0.0036067832046536488, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.251009564293305, "acc_stderr,none": 0.006321788710555926, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.03970158273235173}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.03346409881055953}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.03096451792692341}, "mmlu_high_school_world_history": {"alias": " 
- high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.02904133351059804}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2644628099173554, "acc_stderr,none": 0.040261875275912046}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094633}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.03291099578615767}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2514450867052023, "acc_stderr,none": 0.023357365785874037}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19614147909967847, "acc_stderr,none": 0.022552447780478033}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.023788583551658537}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25945241199478486, "acc_stderr,none": 0.011195262076350314}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.03565079670708312}, "mmlu_other": {"acc,none": 0.2513678789829417, "acc_stderr,none": 0.007773119455365178, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421296}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2528301886792453, "acc_stderr,none": 0.026749899771241238}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2254335260115607, "acc_stderr,none": 0.03186209851641144}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3273542600896861, "acc_stderr,none": 0.03149384670994131}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.040580420156460364}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3034188034188034, "acc_stderr,none": 0.030118210106942645}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24393358876117496, "acc_stderr,none": 0.015357212665829468}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.024630048979824782}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.25177304964539005, "acc_stderr,none": 0.0258921511567094}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1948529411764706, "acc_stderr,none": 0.024060599423487424}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2710843373493976, "acc_stderr,none": 0.034605799075530255}, "mmlu_social_sciences": {"acc,none": 0.22879428014299644, "acc_stderr,none": 0.007571746953652104, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.041424397194893624}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.20202020202020202, "acc_stderr,none": 0.028606204289229876}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20207253886010362, "acc_stderr,none": 
0.02897908979429673}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.20256410256410257, "acc_stderr,none": 0.02037766097037139}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.20733944954128442, "acc_stderr,none": 0.01738141556360866}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306085}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2565359477124183, "acc_stderr,none": 0.017667841612379}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22857142857142856, "acc_stderr,none": 0.02688214492230774}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2736318407960199, "acc_stderr,none": 0.031524391865554016}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_stem": {"acc,none": 0.22803679035838884, "acc_stderr,none": 0.007466377616304861, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.20394736842105263, "acc_stderr,none": 0.032790004063100515}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566017}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.0377525168068637}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.25957446808510637, "acc_stderr,none": 0.028659179374292323}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135302}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21957671957671956, "acc_stderr,none": 0.021320018599770348}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2161290322580645, "acc_stderr,none": 0.023415293433568518}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.17733990147783252, "acc_stderr,none": 0.026874337276808352}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.02504044387700069}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, 
"acc_stderr,none": 0.0347918557259966}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.025416428388767478}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.1171875, "exact_match_stderr,custom-extract": 0.002919180990117075, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.18131101813110181, "exact_match_stderr,custom-extract": 0.01439842736773424}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10012674271229405, "exact_match_stderr,custom-extract": 0.01069307487996213}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0627208480565371, "exact_match_stderr,custom-extract": 0.00720956625001502}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.12926829268292683, "exact_match_stderr,custom-extract": 0.016589241600938227}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1528436018957346, "exact_match_stderr,custom-extract": 0.012393433537406419}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.1021671826625387, "exact_match_stderr,custom-extract": 0.009734547484109863}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.14180929095354522, "exact_match_stderr,custom-extract": 0.012204871709898311}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.14173228346456693, "exact_match_stderr,custom-extract": 0.01789179783326823}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10899182561307902, "exact_match_stderr,custom-extract": 0.009395966618356956}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.08216136195410807, "exact_match_stderr,custom-extract": 0.0074739484609544484}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11688311688311688, "exact_match_stderr,custom-extract": 0.010575091539720222}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.14428857715430862, "exact_match_stderr,custom-extract": 0.015745808625512853}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10623556581986143, "exact_match_stderr,custom-extract": 0.008552816527360386}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.16290726817042606, "exact_match_stderr,custom-extract": 0.013080605724028933}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.248, "acc_stderr,none": 0.019332342821239103, "acc_norm,none": 0.37, "acc_norm_stderr,none": 0.021613289165165785}, "piqa": {"alias": "piqa", "acc,none": 0.6936887921653971, "acc_stderr,none": 0.01075497003236732, "acc_norm,none": 0.6898803046789989, "acc_norm_stderr,none": 0.010791876566843037}, "race": {"alias": "race", "acc,none": 0.3550239234449761, "acc_stderr,none": 0.014809839887617086}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.42681678607983625, "acc_stderr,none": 0.011192223024107382}, "winogrande": {"alias": "winogrande", "acc,none": 0.6187845303867403, "acc_stderr,none": 0.0136501721641603}} {"created_at": "2025-04-27T08:48:42.237650", "global_step": 338000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.33447098976109213, "acc_stderr,none": 0.013787460322441374, "acc_norm,none": 0.3771331058020478, "acc_norm_stderr,none": 
0.014163366896192596}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6422558922558923, "acc_stderr,none": 0.00983577275734336, "acc_norm,none": 0.6140572390572391, "acc_norm_stderr,none": 0.009989277329503951}, "boolq": {"alias": "boolq", "acc,none": 0.7137614678899082, "acc_stderr,none": 0.007905569067672587}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.24078624078624078, "acc_stderr,none": 0.012241029737913607}, "copa": {"alias": "copa", "acc,none": 0.73, "acc_stderr,none": 0.04461960433384741}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4267078271260705, "acc_stderr,none": 0.00493588266625047, "acc_norm,none": 0.5649273053176658, "acc_norm_stderr,none": 0.004947533158712093}, "mmlu": {"acc,none": 0.269690927218345, "acc_stderr,none": 0.0037435691998864325, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27035069075451645, "acc_stderr,none": 0.006473292982124956, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.03970158273235173}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.03546563019624336}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28921568627450983, "acc_stderr,none": 0.031822318676475544}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2616033755274262, "acc_stderr,none": 0.028609516716994934}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.32231404958677684, "acc_stderr,none": 0.042664163633521685}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.043300437496507416}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26380368098159507, "acc_stderr,none": 0.034624199316156234}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2976878612716763, "acc_stderr,none": 0.024617055388677003}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.33762057877813506, "acc_stderr,none": 0.026858825879488533}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.024659685185967284}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2627118644067797, "acc_stderr,none": 0.01124054551499567}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.24561403508771928, "acc_stderr,none": 0.0330140594698725}, "mmlu_other": {"acc,none": 0.2726102349533312, "acc_stderr,none": 0.007985567141642684, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2528301886792453, "acc_stderr,none": 0.026749899771241238}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2774566473988439, "acc_stderr,none": 0.034140140070440354}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2825112107623318, "acc_stderr,none": 0.030216831011508766}, "mmlu_management": {"alias": " - management", "acc,none": 0.3300970873786408, "acc_stderr,none": 0.0465614711001235}, "mmlu_marketing": {"alias": " - marketing", 
"acc,none": 0.3076923076923077, "acc_stderr,none": 0.03023638994217309}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2567049808429119, "acc_stderr,none": 0.015620480263064535}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2908496732026144, "acc_stderr,none": 0.026004800363952113}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2730496453900709, "acc_stderr,none": 0.026577860943307854}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.22794117647058823, "acc_stderr,none": 0.025483081468029804}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2710843373493976, "acc_stderr,none": 0.03460579907553027}, "mmlu_social_sciences": {"acc,none": 0.2674683132921677, "acc_stderr,none": 0.007983776843054131, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.04266339443159394}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.030532892233932022}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.34196891191709844, "acc_stderr,none": 0.03423465100104284}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.24871794871794872, "acc_stderr,none": 0.0219169577092138}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2773109243697479, "acc_stderr,none": 0.029079374539480007}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25504587155963304, "acc_stderr,none": 0.01868850085653584}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2824427480916031, "acc_stderr,none": 0.03948406125768362}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.01784808957491322}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2818181818181818, "acc_stderr,none": 0.043091187099464585}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24489795918367346, "acc_stderr,none": 0.02752963744017492}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2736318407960199, "acc_stderr,none": 0.03152439186555402}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_stem": {"acc,none": 0.2679987313669521, "acc_stderr,none": 0.007875469023527808, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3111111111111111, "acc_stderr,none": 0.03999262876617723}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.29605263157894735, "acc_stderr,none": 0.03715062154998905}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3194444444444444, "acc_stderr,none": 0.038990736873573344}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, 
"acc_stderr,none": 0.04229525846816505}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617746}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3446808510638298, "acc_stderr,none": 0.03106898596312215}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.02326651221373057}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2806451612903226, "acc_stderr,none": 0.025560604721022895}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.28078817733990147, "acc_stderr,none": 0.0316185633535861}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.026842057873833706}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.24503311258278146, "acc_stderr,none": 0.035118075718047245}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18981481481481483, "acc_stderr,none": 0.026744714834691926}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.12159242021276596, "exact_match_stderr,custom-extract": 0.0029650615335849524, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.199442119944212, "exact_match_stderr,custom-extract": 0.014933042396591397}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10266159695817491, "exact_match_stderr,custom-extract": 0.01081232338068658}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07243816254416961, "exact_match_stderr,custom-extract": 0.007707683029020959}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.11219512195121951, "exact_match_stderr,custom-extract": 0.015605730293675839}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.14928909952606634, "exact_match_stderr,custom-extract": 0.012274145317879097}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10835913312693499, "exact_match_stderr,custom-extract": 0.00999056535282795}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1552567237163814, "exact_match_stderr,custom-extract": 0.01266999817209259}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11286089238845144, "exact_match_stderr,custom-extract": 0.01623214090346144}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1226158038147139, "exact_match_stderr,custom-extract": 0.00988944195822389}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0925240562546262, "exact_match_stderr,custom-extract": 0.007886385609194012}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.11688311688311688, "exact_match_stderr,custom-extract": 0.010575091539720215}, 
"mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.11823647294589178, "exact_match_stderr,custom-extract": 0.014468953704661763}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10469591993841416, "exact_match_stderr,custom-extract": 0.008497923442833753}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.18421052631578946, "exact_match_stderr,custom-extract": 0.013731472440912697}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.238, "acc_stderr,none": 0.01906407295819845, "acc_norm,none": 0.356, "acc_norm_stderr,none": 0.02143471235607264}, "piqa": {"alias": "piqa", "acc,none": 0.6942328618063112, "acc_stderr,none": 0.010749627366141642, "acc_norm,none": 0.6920565832426551, "acc_norm_stderr,none": 0.010770892367463685}, "race": {"alias": "race", "acc,none": 0.3645933014354067, "acc_stderr,none": 0.014896354113839586}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.41299897645854655, "acc_stderr,none": 0.01114147769803524}, "winogrande": {"alias": "winogrande", "acc,none": 0.6235201262825573, "acc_stderr,none": 0.01361693196066718}} {"created_at": "2025-04-27T10:46:54.237270", "global_step": 340000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3395904436860068, "acc_stderr,none": 0.013839039762820164, "acc_norm,none": 0.3924914675767918, "acc_norm_stderr,none": 0.014269634635670705}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.656986531986532, "acc_stderr,none": 0.009740965666489224, "acc_norm,none": 0.6367845117845118, "acc_norm_stderr,none": 0.009868397136118805}, "boolq": {"alias": "boolq", "acc,none": 0.6896024464831805, "acc_stderr,none": 0.008091910698229251}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.23177723177723178, "acc_stderr,none": 0.01208089355230227}, "copa": {"alias": "copa", "acc,none": 0.74, "acc_stderr,none": 0.0440844002276808}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4260107548297152, "acc_stderr,none": 0.004934846809827196, "acc_norm,none": 0.5664210316669986, "acc_norm_stderr,none": 0.00494555806985253}, "mmlu": {"acc,none": 0.2751032616436405, "acc_stderr,none": 0.0037649988719360167, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27885228480340063, "acc_stderr,none": 0.006536606957278581, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.15873015873015872, "acc_stderr,none": 0.032684540130117436}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.0340150671524904}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3088235294117647, "acc_stderr,none": 0.03242661719827218}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2911392405063291, "acc_stderr,none": 0.029571601065753374}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2975206611570248, "acc_stderr,none": 0.04173349148083498}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.04414343666854933}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.294478527607362, "acc_stderr,none": 0.03581165790474082}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.31213872832369943, "acc_stderr,none": 0.02494679222527231}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.26256983240223464, "acc_stderr,none": 0.014716824273017754}, 
"mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2797427652733119, "acc_stderr,none": 0.025494259350694905}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.025171041915309684}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2803129074315515, "acc_stderr,none": 0.01147155594495862}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.28654970760233917, "acc_stderr,none": 0.03467826685703826}, "mmlu_other": {"acc,none": 0.2813002896684905, "acc_stderr,none": 0.008044506918268702, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2792452830188679, "acc_stderr,none": 0.027611163402399715}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.0326926380614177}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3094170403587444, "acc_stderr,none": 0.03102441174057222}, "mmlu_management": {"alias": " - management", "acc,none": 0.2912621359223301, "acc_stderr,none": 0.044986763205729224}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3034188034188034, "acc_stderr,none": 0.030118210106942656}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.3128991060025543, "acc_stderr,none": 0.016580935940304048}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.026090162504279056}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.026992199173064356}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.15441176470588236, "acc_stderr,none": 0.021950024722922026}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25301204819277107, "acc_stderr,none": 0.03384429155233133}, "mmlu_social_sciences": {"acc,none": 0.27071823204419887, "acc_stderr,none": 0.008004033361098818, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518752}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25757575757575757, "acc_stderr,none": 0.031156269519646836}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.24352331606217617, "acc_stderr,none": 0.030975436386845426}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.23333333333333334, "acc_stderr,none": 0.02144454730156049}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.25630252100840334, "acc_stderr,none": 0.02835962087053395}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25321100917431194, "acc_stderr,none": 0.018644073041375053}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596919}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3104575163398693, "acc_stderr,none": 0.018718067052623223}, "mmlu_public_relations": 
{"alias": " - public_relations", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.04350271442923243}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.31020408163265306, "acc_stderr,none": 0.029613459872484378}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2835820895522388, "acc_stderr,none": 0.03187187537919797}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_stem": {"acc,none": 0.26768157310497936, "acc_stderr,none": 0.007883672317848534, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3037037037037037, "acc_stderr,none": 0.039725528847851375}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3092105263157895, "acc_stderr,none": 0.037610708698674805}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.15, "acc_stderr,none": 0.035887028128263714}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617747}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2723404255319149, "acc_stderr,none": 0.02910129069838671}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2827586206896552, "acc_stderr,none": 0.03752833958003337}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2566137566137566, "acc_stderr,none": 0.02249451076750315}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2645161290322581, "acc_stderr,none": 0.02509189237885928}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2660098522167488, "acc_stderr,none": 0.03108982600293752}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712173}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.271523178807947, "acc_stderr,none": 0.036313298039696525}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.22685185185185186, "acc_stderr,none": 0.028561650102422266}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3482142857142857, "acc_stderr,none": 0.045218299028335865}, "mmlu_pro": {"exact_match,custom-extract": 0.13331117021276595, "exact_match_stderr,custom-extract": 0.003084361626200313, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.2175732217573222, "exact_match_stderr,custom-extract": 0.015419420884296214}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11406844106463879, "exact_match_stderr,custom-extract": 0.011324518110214694}, 
"mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0892226148409894, "exact_match_stderr,custom-extract": 0.008476416539386454}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.12926829268292683, "exact_match_stderr,custom-extract": 0.016589241600938213}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.17535545023696683, "exact_match_stderr,custom-extract": 0.013097223647424194}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10526315789473684, "exact_match_stderr,custom-extract": 0.009863889056501643}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.18337408312958436, "exact_match_stderr,custom-extract": 0.01353846818840947}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.15748031496062992, "exact_match_stderr,custom-extract": 0.018685785855939566}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1262488646684832, "exact_match_stderr,custom-extract": 0.010014085027799642}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.10584752035529238, "exact_match_stderr,custom-extract": 0.008372964552650804}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1277056277056277, "exact_match_stderr,custom-extract": 0.010985901551102241}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13827655310621242, "exact_match_stderr,custom-extract": 0.015468334539576873}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.11162432640492687, "exact_match_stderr,custom-extract": 0.008740583141383715}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.16290726817042606, "exact_match_stderr,custom-extract": 0.013080605724028933}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.262, "acc_stderr,none": 0.01968468882019472, "acc_norm,none": 0.38, "acc_norm_stderr,none": 0.02172888143870171}, "piqa": {"alias": "piqa", "acc,none": 0.6936887921653971, "acc_stderr,none": 0.010754970032367318, "acc_norm,none": 0.6969532100108814, "acc_norm_stderr,none": 0.0107226486895315}, "race": {"alias": "race", "acc,none": 0.3674641148325359, "acc_stderr,none": 0.014921064308504981}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.44114636642784033, "acc_stderr,none": 0.011235418947344597}, "winogrande": {"alias": "winogrande", "acc,none": 0.6053670086819258, "acc_stderr,none": 0.013736915172371883}} {"created_at": "2025-04-27T12:37:14.410322", "global_step": 342000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.34044368600682595, "acc_stderr,none": 0.013847460518892976, "acc_norm,none": 0.3848122866894198, "acc_norm_stderr,none": 0.014218371065251104}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6468855218855218, "acc_stderr,none": 0.009807078935467617, "acc_norm,none": 0.6334175084175084, "acc_norm_stderr,none": 0.009887786585323959}, "boolq": {"alias": "boolq", "acc,none": 0.7140672782874617, "acc_stderr,none": 0.00790303735916362}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20065520065520065, "acc_stderr,none": 0.011466011466011535}, "copa": {"alias": "copa", "acc,none": 0.71, "acc_stderr,none": 0.045604802157206845}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.42800238996215895, "acc_stderr,none": 0.004937779821908579, "acc_norm,none": 0.5689105755825533, "acc_norm_stderr,none": 0.004942164585991474}, "mmlu": {"acc,none": 
0.2601481270474291, "acc_stderr,none": 0.0036980015646868136, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27587672688629117, "acc_stderr,none": 0.006508456541311156, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.15873015873015872, "acc_stderr,none": 0.03268454013011744}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.034277431758165236}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.28921568627450983, "acc_stderr,none": 0.03182231867647554}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.29535864978902954, "acc_stderr,none": 0.02969633871342289}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.38016528925619836, "acc_stderr,none": 0.04431324501968432}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.04133119440243839}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.31901840490797545, "acc_stderr,none": 0.03661997551073836}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.26878612716763006, "acc_stderr,none": 0.023868003262500107}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3022508038585209, "acc_stderr,none": 0.02608270069539966}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2932098765432099, "acc_stderr,none": 0.02532988817190092}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2816166883963494, "acc_stderr,none": 0.011487783272786694}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.27485380116959063, "acc_stderr,none": 0.03424042924691582}, "mmlu_other": {"acc,none": 0.2442870936594786, "acc_stderr,none": 0.007700252276938819, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2188679245283019, "acc_stderr,none": 0.02544786382510861}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.03295304696818318}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.23766816143497757, "acc_stderr,none": 0.028568079464714274}, "mmlu_management": {"alias": " - management", "acc,none": 0.22330097087378642, "acc_stderr,none": 0.04123553189891431}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.24786324786324787, "acc_stderr,none": 0.028286324075564383}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2681992337164751, "acc_stderr,none": 0.015842430835269438}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.024954184324879905}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.026684564340460994}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.17279411764705882, "acc_stderr,none": 0.02296606758558176}, 
"mmlu_virology": {"alias": " - virology", "acc,none": 0.20481927710843373, "acc_stderr,none": 0.031417842916639245}, "mmlu_social_sciences": {"acc,none": 0.2447188820279493, "acc_stderr,none": 0.007752784510244494, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.039994238792813365}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2474747474747475, "acc_stderr,none": 0.030746300742124505}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21243523316062177, "acc_stderr,none": 0.029519282616817258}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2205128205128205, "acc_stderr,none": 0.02102067268082791}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.25630252100840334, "acc_stderr,none": 0.028359620870533953}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22385321100917432, "acc_stderr,none": 0.017871217767790215}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.03768335959728744}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2826797385620915, "acc_stderr,none": 0.018217269552053446}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.20909090909090908, "acc_stderr,none": 0.038950910157241364}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24489795918367346, "acc_stderr,none": 0.027529637440174917}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2537313432835821, "acc_stderr,none": 0.03076944496729601}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_stem": {"acc,none": 0.26736441484300666, "acc_stderr,none": 0.00787955658141289, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.32592592592592595, "acc_stderr,none": 0.040491220417025055}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3026315789473684, "acc_stderr,none": 0.03738520676119669}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2127659574468085, "acc_stderr,none": 0.026754391348039766}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.03806142687309993}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2751322751322751, "acc_stderr,none": 0.023000086859068642}, 
"mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25161290322580643, "acc_stderr,none": 0.024685979286239956}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2955665024630542, "acc_stderr,none": 0.032104944337514575}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2781456953642384, "acc_stderr,none": 0.03658603262763743}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.028353212866863438}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.26785714285714285, "acc_stderr,none": 0.04203277291467764}, "mmlu_pro": {"exact_match,custom-extract": 0.13139960106382978, "exact_match_stderr,custom-extract": 0.003068304056249306, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.200836820083682, "exact_match_stderr,custom-extract": 0.01497210589741097}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.12547528517110265, "exact_match_stderr,custom-extract": 0.011800544058606085}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.10070671378091872, "exact_match_stderr,custom-extract": 0.008948465856509545}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.12195121951219512, "exact_match_stderr,custom-extract": 0.0161804554422017}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1623222748815166, "exact_match_stderr,custom-extract": 0.012700293472235506}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.11661506707946337, "exact_match_stderr,custom-extract": 0.010316078740894952}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.15403422982885084, "exact_match_stderr,custom-extract": 0.012629146112943938}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10236220472440945, "exact_match_stderr,custom-extract": 0.015549935163883108}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11171662125340599, "exact_match_stderr,custom-extract": 0.009498134639311172}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.09400444115470022, "exact_match_stderr,custom-extract": 0.007942739810804827}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1341991341991342, "exact_match_stderr,custom-extract": 0.011219745719880868}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13827655310621242, "exact_match_stderr,custom-extract": 0.015468334539576873}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.123941493456505, "exact_match_stderr,custom-extract": 0.00914613424852524}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.19423558897243107, "exact_match_stderr,custom-extract": 0.014013263342163263}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.256, "acc_stderr,none": 0.019536923574747605, "acc_norm,none": 0.38, "acc_norm_stderr,none": 0.02172888143870171}, "piqa": {"alias": "piqa", "acc,none": 0.6893362350380848, 
"acc_stderr,none": 0.010797078933727673, "acc_norm,none": 0.6942328618063112, "acc_norm_stderr,none": 0.010749627366141639}, "race": {"alias": "race", "acc,none": 0.3751196172248804, "acc_stderr,none": 0.014984183551431947}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.42528147389969295, "acc_stderr,none": 0.01118702758575761}, "winogrande": {"alias": "winogrande", "acc,none": 0.6258879242304657, "acc_stderr,none": 0.013599792958329826}} {"created_at": "2025-04-27T16:32:58.837221", "global_step": 346000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3575085324232082, "acc_stderr,none": 0.014005494275916571, "acc_norm,none": 0.4069965870307167, "acc_norm_stderr,none": 0.014356399418009123}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6881313131313131, "acc_stderr,none": 0.009505823345817666, "acc_norm,none": 0.6893939393939394, "acc_norm_stderr,none": 0.009495260551195607}, "boolq": {"alias": "boolq", "acc,none": 0.7189602446483181, "acc_stderr,none": 0.007861924290695656}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21457821457821458, "acc_stderr,none": 0.011753423094216842}, "copa": {"alias": "copa", "acc,none": 0.74, "acc_stderr,none": 0.04408440022768078}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.429097789285003, "acc_stderr,none": 0.004939358145561324, "acc_norm,none": 0.5732921728739295, "acc_norm_stderr,none": 0.004935882666250486}, "mmlu": {"acc,none": 0.2679105540521293, "acc_stderr,none": 0.0037320302443071243, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.26631243358129647, "acc_stderr,none": 0.006440382448129185, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.039325376803928724}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.28484848484848485, "acc_stderr,none": 0.03524390844511784}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3627450980392157, "acc_stderr,none": 0.03374499356319355}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.31223628691983124, "acc_stderr,none": 0.030165137867847018}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.21487603305785125, "acc_stderr,none": 0.03749492448709695}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.27607361963190186, "acc_stderr,none": 0.0351238528370505}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2774566473988439, "acc_stderr,none": 0.024105712607754307}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2797427652733119, "acc_stderr,none": 0.0254942593506949}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2654320987654321, "acc_stderr,none": 0.024569223600460842}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25488917861799215, "acc_stderr,none": 0.01113050981266298}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.034462962170884265}, "mmlu_other": {"acc,none": 0.2964274219504345, "acc_stderr,none": 0.008184235212389625, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.34, 
"acc_stderr,none": 0.04760952285695235}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3132075471698113, "acc_stderr,none": 0.02854479331905533}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.0332055644308557}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3721973094170404, "acc_stderr,none": 0.03244305283008732}, "mmlu_management": {"alias": " - management", "acc,none": 0.2621359223300971, "acc_stderr,none": 0.04354631077260597}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3034188034188034, "acc_stderr,none": 0.030118210106942638}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.3052362707535121, "acc_stderr,none": 0.016467711947635116}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.02678745311190654}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24468085106382978, "acc_stderr,none": 0.025645553622266736}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2426470588235294, "acc_stderr,none": 0.026040662474201264}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.26506024096385544, "acc_stderr,none": 0.03436024037944967}, "mmlu_social_sciences": {"acc,none": 0.26259343516412087, "acc_stderr,none": 0.007930121756241144, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.041857744240220575}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.02912652283458684}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.31088082901554404, "acc_stderr,none": 0.03340361906276587}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.23076923076923078, "acc_stderr,none": 0.021362027725222724}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.25210084033613445, "acc_stderr,none": 0.02820554503327772}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26788990825688075, "acc_stderr,none": 0.018987462257978652}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2824427480916031, "acc_stderr,none": 0.03948406125768361}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27124183006535946, "acc_stderr,none": 0.017986615304030316}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.04607582090719976}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.24897959183673468, "acc_stderr,none": 0.027682979522960227}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24378109452736318, "acc_stderr,none": 0.030360490154014652}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_stem": {"acc,none": 0.24738344433872503, "acc_stderr,none": 0.007677758906132947, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 
0.044619604333847394}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3037037037037037, "acc_stderr,none": 0.03972552884785136}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.21710526315789475, "acc_stderr,none": 0.033550453048829226}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617747}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342347}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131184}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.022019080012217904}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2903225806451613, "acc_stderr,none": 0.025822106119415884}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.17733990147783252, "acc_stderr,none": 0.02687433727680835}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621505}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.026466117538959916}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23178807947019867, "acc_stderr,none": 0.03445406271987054}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2175925925925926, "acc_stderr,none": 0.028139689444859676}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04287858751340456}, "mmlu_pro": {"exact_match,custom-extract": 0.13372672872340424, "exact_match_stderr,custom-extract": 0.0030818223207992467, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.23709902370990238, "exact_match_stderr,custom-extract": 0.01589433846824644}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11406844106463879, "exact_match_stderr,custom-extract": 0.011324518110214694}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06713780918727916, "exact_match_stderr,custom-extract": 0.007441509249865656}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.12926829268292683, "exact_match_stderr,custom-extract": 0.016589241600938213}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.18838862559241706, "exact_match_stderr,custom-extract": 0.013467519528615371}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.11764705882352941, "exact_match_stderr,custom-extract": 0.010355570444404803}, "mmlu_pro_health": {"alias": 
" - health", "exact_match,custom-extract": 0.16625916870415647, "exact_match_stderr,custom-extract": 0.013025587168508141}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12860892388451445, "exact_match_stderr,custom-extract": 0.01717316362524467}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09809264305177112, "exact_match_stderr,custom-extract": 0.008968149521850249}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.12435233160621761, "exact_match_stderr,custom-extract": 0.008981001727115759}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1341991341991342, "exact_match_stderr,custom-extract": 0.011219745719880865}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1282565130260521, "exact_match_stderr,custom-extract": 0.014983711363001532}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1170130869899923, "exact_match_stderr,custom-extract": 0.0089218929006093}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.18295739348370926, "exact_match_stderr,custom-extract": 0.013695193528694773}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.258, "acc_stderr,none": 0.019586711785215837, "acc_norm,none": 0.386, "acc_norm_stderr,none": 0.021793529219281165}, "piqa": {"alias": "piqa", "acc,none": 0.6958650707290533, "acc_stderr,none": 0.010733493335721316, "acc_norm,none": 0.705658324265506, "acc_norm_stderr,none": 0.0106333114703475}, "race": {"alias": "race", "acc,none": 0.3645933014354067, "acc_stderr,none": 0.014896354113839588}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.44421699078812693, "acc_stderr,none": 0.011243437088559823}, "winogrande": {"alias": "winogrande", "acc,none": 0.6132596685082873, "acc_stderr,none": 0.013687214761883053}} {"created_at": "2025-04-27T20:13:04.013175", "global_step": 350000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.37457337883959047, "acc_stderr,none": 0.014144193471893437, "acc_norm,none": 0.4351535836177474, "acc_norm_stderr,none": 0.01448798619718605}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6948653198653199, "acc_stderr,none": 0.009448531094163907, "acc_norm,none": 0.680976430976431, "acc_norm_stderr,none": 0.009564133249441067}, "boolq": {"alias": "boolq", "acc,none": 0.6966360856269113, "acc_stderr,none": 0.008040396817430634}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21375921375921375, "acc_stderr,none": 0.011737086112127208}, "copa": {"alias": "copa", "acc,none": 0.76, "acc_stderr,none": 0.04292346959909282}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4311890061740689, "acc_stderr,none": 0.004942302768002107, "acc_norm,none": 0.5717984465245967, "acc_norm_stderr,none": 0.004938068627349492}, "mmlu": {"acc,none": 0.27253952428429, "acc_stderr,none": 0.0037526792287847868, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2682252922422954, "acc_stderr,none": 0.006461289944670812, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.040061680838488774}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.28484848484848485, "acc_stderr,none": 0.035243908445117836}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.29901960784313725, "acc_stderr,none": 0.03213325717373617}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", 
"acc,none": 0.28270042194092826, "acc_stderr,none": 0.029312814153955917}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2644628099173554, "acc_stderr,none": 0.04026187527591205}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.0413311944024384}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.31901840490797545, "acc_stderr,none": 0.03661997551073836}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.25722543352601157, "acc_stderr,none": 0.023532925431044287}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24581005586592178, "acc_stderr,none": 0.014400296429225601}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2990353697749196, "acc_stderr,none": 0.026003301117885135}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.024659685185967284}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26988265971316816, "acc_stderr,none": 0.011337381084250408}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03218093795602357}, "mmlu_other": {"acc,none": 0.2642420341165111, "acc_stderr,none": 0.007894057300169368, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2339622641509434, "acc_stderr,none": 0.02605529690115292}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2658959537572254, "acc_stderr,none": 0.033687629322594295}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.4, "acc_stderr,none": 0.049236596391733084}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.18385650224215247, "acc_stderr,none": 0.025998379092356517}, "mmlu_management": {"alias": " - management", "acc,none": 0.30097087378640774, "acc_stderr,none": 0.045416094465039476}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.25213675213675213, "acc_stderr,none": 0.02844796547623102}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621505}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.28735632183908044, "acc_stderr,none": 0.0161824107306827}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.025553169991826503}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2553191489361702, "acc_stderr,none": 0.02601199293090202}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2536764705882353, "acc_stderr,none": 0.026431329870789555}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21084337349397592, "acc_stderr,none": 0.031755547866299215}, "mmlu_social_sciences": {"acc,none": 0.2723431914202145, "acc_stderr,none": 0.008026832359453557, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.04185774424022056}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.21717171717171718, "acc_stderr,none": 0.029376616484945633}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.27461139896373055, "acc_stderr,none": 0.03221024508041154}, 
"mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.258974358974359, "acc_stderr,none": 0.022211106810061682}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.27310924369747897, "acc_stderr,none": 0.028942004040998167}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26422018348623855, "acc_stderr,none": 0.018904164171510196}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.03768335959728745}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.018054027458815198}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3181818181818182, "acc_stderr,none": 0.044612721759105085}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.33877551020408164, "acc_stderr,none": 0.030299506562154185}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.27860696517412936, "acc_stderr,none": 0.031700561834973086}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_stem": {"acc,none": 0.2873453853472883, "acc_stderr,none": 0.008030116483595804, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.34074074074074073, "acc_stderr,none": 0.040943762699967946}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.34210526315789475, "acc_stderr,none": 0.038607315993160925}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2847222222222222, "acc_stderr,none": 0.03773809990686934}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.42, "acc_stderr,none": 0.04960449637488584}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.17647058823529413, "acc_stderr,none": 0.0379328118530781}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.25957446808510637, "acc_stderr,none": 0.028659179374292323}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.21379310344827587, "acc_stderr,none": 0.034165204477475494}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.022644212615525218}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3193548387096774, "acc_stderr,none": 0.02652270967466777}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3054187192118227, "acc_stderr,none": 0.03240661565868408}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.026962424325073828}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2052980132450331, "acc_stderr,none": 
0.03297986648473836}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.03214952147802748}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.20535714285714285, "acc_stderr,none": 0.03834241021419073}, "mmlu_pro": {"exact_match,custom-extract": 0.12342087765957446, "exact_match_stderr,custom-extract": 0.002985033256051767, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.17154811715481172, "exact_match_stderr,custom-extract": 0.014088673719425003}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.12420785804816223, "exact_match_stderr,custom-extract": 0.011749298825998155}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06448763250883392, "exact_match_stderr,custom-extract": 0.007303510883881922}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.13414634146341464, "exact_match_stderr,custom-extract": 0.016851944127279108}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.17061611374407584, "exact_match_stderr,custom-extract": 0.012956092265003955}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10319917440660474, "exact_match_stderr,custom-extract": 0.009777963967387037}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.14058679706601468, "exact_match_stderr,custom-extract": 0.012160802933047539}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.09711286089238845, "exact_match_stderr,custom-extract": 0.015190193611399451}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.12079927338782924, "exact_match_stderr,custom-extract": 0.009826069635820892}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.08660251665433012, "exact_match_stderr,custom-extract": 0.007654701811618176}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12987012987012986, "exact_match_stderr,custom-extract": 0.011064857512116038}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.14428857715430862, "exact_match_stderr,custom-extract": 0.015745808625512853}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.11778290993071594, "exact_match_stderr,custom-extract": 0.008947290267287298}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.18170426065162906, "exact_match_stderr,custom-extract": 0.013658674004058884}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.256, "acc_stderr,none": 0.019536923574747605, "acc_norm,none": 0.398, "acc_norm_stderr,none": 0.02191237788577997}, "piqa": {"alias": "piqa", "acc,none": 0.6964091403699674, "acc_stderr,none": 0.010728079893076364, "acc_norm,none": 0.6964091403699674, "acc_norm_stderr,none": 0.010728079893076357}, "race": {"alias": "race", "acc,none": 0.3521531100478469, "acc_stderr,none": 0.014782629897202254}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43807574206755373, "acc_stderr,none": 0.01122696506802993}, "winogrande": {"alias": "winogrande", "acc,none": 0.6211523283346487, "acc_stderr,none": 0.013633724603180318}} {"created_at": "2025-04-27T21:41:15.808794", "global_step": 352000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3643344709897611, "acc_stderr,none": 0.014063260279882413, "acc_norm,none": 0.4044368600682594, 
"acc_norm_stderr,none": 0.014342036483436172}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6734006734006734, "acc_stderr,none": 0.00962304703826764, "acc_norm,none": 0.6628787878787878, "acc_norm_stderr,none": 0.009700146509130075}, "boolq": {"alias": "boolq", "acc,none": 0.6880733944954128, "acc_stderr,none": 0.008102818891778078}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.22276822276822278, "acc_stderr,none": 0.011913022964039548}, "copa": {"alias": "copa", "acc,none": 0.75, "acc_stderr,none": 0.04351941398892446}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.433877713602868, "acc_stderr,none": 0.004945956744943811, "acc_norm,none": 0.5787691694881498, "acc_norm_stderr,none": 0.004927473370720143}, "mmlu": {"acc,none": 0.2817974647486113, "acc_stderr,none": 0.003790018675476917, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2639744952178533, "acc_stderr,none": 0.006426340611557341, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.3253968253968254, "acc_stderr,none": 0.041905964388711366}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3090909090909091, "acc_stderr,none": 0.036085410115739666}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.03132179803083293}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.21518987341772153, "acc_stderr,none": 0.026750826994676173}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2231404958677686, "acc_stderr,none": 0.03800754475228733}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.038935425188248475}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26380368098159507, "acc_stderr,none": 0.03462419931615624}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.25722543352601157, "acc_stderr,none": 0.023532925431044276}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.25251396648044694, "acc_stderr,none": 0.014530330201468662}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2733118971061093, "acc_stderr,none": 0.025311765975426122}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.27469135802469136, "acc_stderr,none": 0.024836057868294677}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27444589308996087, "acc_stderr,none": 0.011397043163078154}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23976608187134502, "acc_stderr,none": 0.03274485211946956}, "mmlu_other": {"acc,none": 0.28741551335693594, "acc_stderr,none": 0.008085743916042285, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.30566037735849055, "acc_stderr,none": 0.028353298073322666}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3583815028901734, "acc_stderr,none": 0.0365634365335316}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.18834080717488788, "acc_stderr,none": 0.02624113299640727}, "mmlu_management": {"alias": " - management", "acc,none": 0.2621359223300971, "acc_stderr,none": 0.04354631077260595}, "mmlu_marketing": 
{"alias": " - marketing", "acc,none": 0.32051282051282054, "acc_stderr,none": 0.030572811310299607}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2796934865900383, "acc_stderr,none": 0.016050792148036525}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.30392156862745096, "acc_stderr,none": 0.026336613469046644}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24822695035460993, "acc_stderr,none": 0.025770015644290403}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3897058823529412, "acc_stderr,none": 0.029624663581159696}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.24096385542168675, "acc_stderr,none": 0.0332939411907353}, "mmlu_social_sciences": {"acc,none": 0.30679233019174523, "acc_stderr,none": 0.008312085053658384, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.04142439719489361}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.29292929292929293, "acc_stderr,none": 0.03242497958178817}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.34196891191709844, "acc_stderr,none": 0.03423465100104283}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2923076923076923, "acc_stderr,none": 0.023060438380857733}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3445378151260504, "acc_stderr,none": 0.03086868260412163}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.326605504587156, "acc_stderr,none": 0.020106990889937306}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.26717557251908397, "acc_stderr,none": 0.03880848301082397}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.28431372549019607, "acc_stderr,none": 0.01824902441120766}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.04069306319721377}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3551020408163265, "acc_stderr,none": 0.030635655150387634}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.31840796019900497, "acc_stderr,none": 0.03294118479054096}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.3, "acc_stderr,none": 0.04605661864718381}, "mmlu_stem": {"acc,none": 0.278464954012052, "acc_stderr,none": 0.007978595346692968, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3092105263157895, "acc_stderr,none": 0.037610708698674805}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.03852084696008534}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_college_mathematics": {"alias": " - 
college_mathematics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.30392156862745096, "acc_stderr,none": 0.04576665403207762}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720683}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.251063829787234, "acc_stderr,none": 0.02834696377716245}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.036951833116502325}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.022569897074918417}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3, "acc_stderr,none": 0.026069362295335144}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.270935960591133, "acc_stderr,none": 0.03127090713297698}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.21851851851851853, "acc_stderr,none": 0.02519575225182379}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31125827814569534, "acc_stderr,none": 0.03780445850526732}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.36574074074074076, "acc_stderr,none": 0.03284738857647207}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04287858751340456}, "mmlu_pro": {"exact_match,custom-extract": 0.1254155585106383, "exact_match_stderr,custom-extract": 0.0030021632590520922, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.18688981868898186, "exact_match_stderr,custom-extract": 0.014568371570721193}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10646387832699619, "exact_match_stderr,custom-extract": 0.010987378600044825}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07332155477031801, "exact_match_stderr,custom-extract": 0.007750845159494202}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.14634146341463414, "exact_match_stderr,custom-extract": 0.017476889350508576}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.19075829383886256, "exact_match_stderr,custom-extract": 0.013532157875306485}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09391124871001032, "exact_match_stderr,custom-extract": 0.009375760359013255}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.14547677261613692, "exact_match_stderr,custom-extract": 0.012335243774582429}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11286089238845144, "exact_match_stderr,custom-extract": 0.016232140903461433}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10535876475930972, "exact_match_stderr,custom-extract": 0.009256854730301894}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0917838638045892, "exact_match_stderr,custom-extract": 0.007857979485361452}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.14502164502164502, 
"exact_match_stderr,custom-extract": 0.011590258522971733}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1462925851703407, "exact_match_stderr,custom-extract": 0.015836201263905444}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1100846805234796, "exact_match_stderr,custom-extract": 0.008687612438998108}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.18045112781954886, "exact_match_stderr,custom-extract": 0.013621911931802996}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.258, "acc_stderr,none": 0.019586711785215837, "acc_norm,none": 0.372, "acc_norm_stderr,none": 0.0216371979857224}, "piqa": {"alias": "piqa", "acc,none": 0.6920565832426551, "acc_stderr,none": 0.010770892367463682, "acc_norm,none": 0.7018498367791077, "acc_norm_stderr,none": 0.010672964114008301}, "race": {"alias": "race", "acc,none": 0.3617224880382775, "acc_stderr,none": 0.014871072026717747}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43398157625383826, "acc_stderr,none": 0.011215013703683814}, "winogrande": {"alias": "winogrande", "acc,none": 0.6124704025256511, "acc_stderr,none": 0.01369235463601677}} {"created_at": "2025-04-27T23:37:42.849179", "global_step": 354000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.35580204778157, "acc_stderr,none": 0.01399057113791876, "acc_norm,none": 0.4052901023890785, "acc_norm_stderr,none": 0.01434686906022933}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6910774410774411, "acc_stderr,none": 0.00948104838776135, "acc_norm,none": 0.6822390572390572, "acc_norm_stderr,none": 0.009554033064443062}, "boolq": {"alias": "boolq", "acc,none": 0.7345565749235474, "acc_stderr,none": 0.0077230909835904705}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2153972153972154, "acc_stderr,none": 0.011769690686226967}, "copa": {"alias": "copa", "acc,none": 0.75, "acc_stderr,none": 0.04351941398892446}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4342760406293567, "acc_stderr,none": 0.004946485466544621, "acc_norm,none": 0.5718980282812188, "acc_norm_stderr,none": 0.00493792432674258}, "mmlu": {"acc,none": 0.2839339125480701, "acc_stderr,none": 0.003790112482847812, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25759829968119025, "acc_stderr,none": 0.006372354532171793, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30158730158730157, "acc_stderr,none": 0.04104947269903394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3393939393939394, "acc_stderr,none": 0.03697442205031596}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3137254901960784, "acc_stderr,none": 0.03256685484460388}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.24472573839662448, "acc_stderr,none": 0.027985699387036416}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2892561983471074, "acc_stderr,none": 0.04139112727635461}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.03957835471980979}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.033519538795212696}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.22254335260115607, "acc_stderr,none": 0.02239421566194282}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 
0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2829581993569132, "acc_stderr,none": 0.025583062489984827}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24382716049382716, "acc_stderr,none": 0.023891879541959607}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25749674054758803, "acc_stderr,none": 0.011167706014904156}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.032180937956023566}, "mmlu_other": {"acc,none": 0.29127775989700677, "acc_stderr,none": 0.008143078852547598, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3132075471698113, "acc_stderr,none": 0.028544793319055326}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.32947976878612717, "acc_stderr,none": 0.03583901754736411}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.26905829596412556, "acc_stderr,none": 0.029763779406874972}, "mmlu_management": {"alias": " - management", "acc,none": 0.36893203883495146, "acc_stderr,none": 0.047776151811567386}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.25213675213675213, "acc_stderr,none": 0.02844796547623102}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2707535121328225, "acc_stderr,none": 0.015889888362560486}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2973856209150327, "acc_stderr,none": 0.026173908506718576}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24468085106382978, "acc_stderr,none": 0.025645553622266733}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3639705882352941, "acc_stderr,none": 0.02922719246003203}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.303542411439714, "acc_stderr,none": 0.008261698180047591, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281337}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.03191178226713547}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.31088082901554404, "acc_stderr,none": 0.03340361906276587}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.33076923076923076, "acc_stderr,none": 0.02385479568097112}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.35294117647058826, "acc_stderr,none": 0.031041941304059288}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.29541284403669726, "acc_stderr,none": 0.019560619182976}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.26717557251908397, "acc_stderr,none": 0.03880848301082396}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.26633986928104575, 
"acc_stderr,none": 0.01788318813466719}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2, "acc_stderr,none": 0.03831305140884601}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.40816326530612246, "acc_stderr,none": 0.03146465712827423}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.34328358208955223, "acc_stderr,none": 0.03357379665433431}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_stem": {"acc,none": 0.29686013320647003, "acc_stderr,none": 0.00806892615616774, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.19, "acc_stderr,none": 0.039427724440366234}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.03547854198560824}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.32894736842105265, "acc_stderr,none": 0.038234289699266046}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3125, "acc_stderr,none": 0.038760854559127644}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.04655010411319615}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.20851063829787234, "acc_stderr,none": 0.026556982117838728}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3103448275862069, "acc_stderr,none": 0.03855289616378948}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.022569897074918424}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3419354838709677, "acc_stderr,none": 0.02698528957655273}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.32019704433497537, "acc_stderr,none": 0.0328264938530415}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.026842057873833706}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.3841059602649007, "acc_stderr,none": 0.03971301814719198}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.4444444444444444, "acc_stderr,none": 0.03388857118502325}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.15178571428571427, "acc_stderr,none": 0.03405702838185691}, "mmlu_pro": {"exact_match,custom-extract": 0.12400265957446809, "exact_match_stderr,custom-extract": 0.0029813654329394123, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.199442119944212, "exact_match_stderr,custom-extract": 0.014933042396591396}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10646387832699619, 
"exact_match_stderr,custom-extract": 0.010987378600044838}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.06713780918727916, "exact_match_stderr,custom-extract": 0.0074415092498656375}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.15365853658536585, "exact_match_stderr,custom-extract": 0.01783156665820722}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1966824644549763, "exact_match_stderr,custom-extract": 0.013690290289032438}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09907120743034056, "exact_match_stderr,custom-extract": 0.009602432935115167}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.17603911980440098, "exact_match_stderr,custom-extract": 0.013324375473773221}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11811023622047244, "exact_match_stderr,custom-extract": 0.01655614119804242}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10626702997275204, "exact_match_stderr,custom-extract": 0.009291949023141264}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.10288675055514433, "exact_match_stderr,custom-extract": 0.00826868555613196}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1277056277056277, "exact_match_stderr,custom-extract": 0.010985901551102233}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13226452905811623, "exact_match_stderr,custom-extract": 0.015181011139551245}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.07467282525019246, "exact_match_stderr,custom-extract": 0.007296113874717256}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.17293233082706766, "exact_match_stderr,custom-extract": 0.013396133254614466}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.252, "acc_stderr,none": 0.01943572728224952, "acc_norm,none": 0.382, "acc_norm_stderr,none": 0.021750820591250834}, "piqa": {"alias": "piqa", "acc,none": 0.6871599564744287, "acc_stderr,none": 0.0108177144257011, "acc_norm,none": 0.6985854189336235, "acc_norm_stderr,none": 0.010706248242753761}, "race": {"alias": "race", "acc,none": 0.3799043062200957, "acc_stderr,none": 0.01502160080493565}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4390992835209826, "acc_stderr,none": 0.011229831502847804}, "winogrande": {"alias": "winogrande", "acc,none": 0.6227308602999211, "acc_stderr,none": 0.013622567928799501}} {"created_at": "2025-04-28T01:34:32.186262", "global_step": 356000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3464163822525597, "acc_stderr,none": 0.013905011180063239, "acc_norm,none": 0.3924914675767918, "acc_norm_stderr,none": 0.014269634635670709}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6721380471380471, "acc_stderr,none": 0.009632587076170011, "acc_norm,none": 0.6430976430976431, "acc_norm_stderr,none": 0.009830630210347016}, "boolq": {"alias": "boolq", "acc,none": 0.6932721712538227, "acc_stderr,none": 0.008065309051771783}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21785421785421785, "acc_stderr,none": 0.011818079981132525}, "copa": {"alias": "copa", "acc,none": 0.71, "acc_stderr,none": 0.04560480215720684}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4355706034654451, "acc_stderr,none": 0.00494818136702494, "acc_norm,none": 0.577275443138817, "acc_norm_stderr,none": 
0.004929828337606974}, "mmlu": {"acc,none": 0.259792052414186, "acc_stderr,none": 0.0036965480296688247, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.25377258235919237, "acc_stderr,none": 0.006340367871161165, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.0361960452412425}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.03546563019624337}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.03198001660115069}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.31645569620253167, "acc_stderr,none": 0.03027497488021897}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.03984979653302872}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.04236511258094633}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.24539877300613497, "acc_stderr,none": 0.03380939813943354}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2630057803468208, "acc_stderr,none": 0.023703099525258172}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3054662379421222, "acc_stderr,none": 0.026160584450140478}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.023132376234543332}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24445893089960888, "acc_stderr,none": 0.01097642501311389}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.03126781714663179}, "mmlu_other": {"acc,none": 0.28934663662697135, "acc_stderr,none": 0.0081247632916482, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2943396226415094, "acc_stderr,none": 0.028049186315695248}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2832369942196532, "acc_stderr,none": 0.03435568056047873}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.37668161434977576, "acc_stderr,none": 0.032521134899291884}, "mmlu_management": {"alias": " - management", "acc,none": 0.2912621359223301, "acc_stderr,none": 0.044986763205729224}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3418803418803419, "acc_stderr,none": 0.03107502852650776}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2835249042145594, "acc_stderr,none": 0.016117318166832272}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.238562091503268, "acc_stderr,none": 0.024404394928087866}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.02699219917306436}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.23897058823529413, "acc_stderr,none": 
0.025905280644893006}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3192771084337349, "acc_stderr,none": 0.03629335329947861}, "mmlu_social_sciences": {"acc,none": 0.2577185570360741, "acc_stderr,none": 0.007895639275638942, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.041857744240220575}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.23232323232323232, "acc_stderr,none": 0.030088629490217483}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2538860103626943, "acc_stderr,none": 0.03141024780565318}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2564102564102564, "acc_stderr,none": 0.022139081103971538}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.28991596638655465, "acc_stderr,none": 0.029472485833136084}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.25871559633027524, "acc_stderr,none": 0.01877605231961962}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596919}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.017952449196987866}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.04172343038705383}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.23673469387755103, "acc_stderr,none": 0.02721283588407315}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2537313432835821, "acc_stderr,none": 0.03076944496729601}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_stem": {"acc,none": 0.241674595623216, "acc_stderr,none": 0.007617733906661431, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.03712537833614865}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19736842105263158, "acc_stderr,none": 0.03238981601699397}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.22916666666666666, "acc_stderr,none": 0.035146974678623884}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.15, "acc_stderr,none": 0.03588702812826371}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.04158307533083286}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3148936170212766, "acc_stderr,none": 0.030363582197238174}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131184}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 
0.0220190800122179}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.267741935483871, "acc_stderr,none": 0.02518900666021238}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.24630541871921183, "acc_stderr,none": 0.030315099285617708}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2037037037037037, "acc_stderr,none": 0.024556172219141265}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2251655629139073, "acc_stderr,none": 0.03410435282008936}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.19444444444444445, "acc_stderr,none": 0.026991454502036712}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.12691156914893617, "exact_match_stderr,custom-extract": 0.003022744785205824, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.16736401673640167, "exact_match_stderr,custom-extract": 0.013950896661189977}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11913814955640051, "exact_match_stderr,custom-extract": 0.011540276571470737}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07685512367491167, "exact_match_stderr,custom-extract": 0.007920271010098666}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.11951219512195121, "exact_match_stderr,custom-extract": 0.016040065235546762}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1670616113744076, "exact_match_stderr,custom-extract": 0.012847865601311619}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09494324045407637, "exact_match_stderr,custom-extract": 0.00942176471567824}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.18092909535452323, "exact_match_stderr,custom-extract": 0.01346802541043788}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.10761154855643044, "exact_match_stderr,custom-extract": 0.015896979452723368}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.12352406902815623, "exact_match_stderr,custom-extract": 0.009920862929791524}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.09548482605477424, "exact_match_stderr,custom-extract": 0.007998494027144747}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1396103896103896, "exact_match_stderr,custom-extract": 0.011407897167896806}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13827655310621242, "exact_match_stderr,custom-extract": 0.015468334539576873}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.12086220169361046, "exact_match_stderr,custom-extract": 0.009047662268420943}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.16917293233082706, "exact_match_stderr,custom-extract": 0.013279801895764033}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.254, "acc_stderr,none": 0.01948659680164338, "acc_norm,none": 0.38, "acc_norm_stderr,none": 0.02172888143870171}, "piqa": {"alias": "piqa", "acc,none": 0.690424374319913, 
"acc_stderr,none": 0.010786656752183345, "acc_norm,none": 0.70620239390642, "acc_norm_stderr,none": 0.010627574080514797}, "race": {"alias": "race", "acc,none": 0.369377990430622, "acc_stderr,none": 0.014937221457864277}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43091095189355166, "acc_stderr,none": 0.01120553917756672}, "winogrande": {"alias": "winogrande", "acc,none": 0.6069455406471981, "acc_stderr,none": 0.01372727624910844}} {"created_at": "2025-04-28T03:49:46.252305", "global_step": 358000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3378839590443686, "acc_stderr,none": 0.01382204792228351, "acc_norm,none": 0.40187713310580203, "acc_norm_stderr,none": 0.014327268614578276}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6742424242424242, "acc_stderr,none": 0.009616642976885963, "acc_norm,none": 0.6574074074074074, "acc_norm_stderr,none": 0.009738105469984187}, "boolq": {"alias": "boolq", "acc,none": 0.6483180428134556, "acc_stderr,none": 0.008351445237661383}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2203112203112203, "acc_stderr,none": 0.01186585494340244}, "copa": {"alias": "copa", "acc,none": 0.67, "acc_stderr,none": 0.04725815626252607}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.43238398725353516, "acc_stderr,none": 0.004943945069611446, "acc_norm,none": 0.5764787890858395, "acc_norm_stderr,none": 0.004931065434173696}, "mmlu": {"acc,none": 0.2766699900299103, "acc_stderr,none": 0.0037667272286089916, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2786397449521785, "acc_stderr,none": 0.006526378345632879, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04006168083848878}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.03427743175816524}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.36764705882352944, "acc_stderr,none": 0.03384132045674118}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3670886075949367, "acc_stderr,none": 0.03137624072561619}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.30578512396694213, "acc_stderr,none": 0.04205953933884123}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3148148148148148, "acc_stderr,none": 0.04489931073591311}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3128834355828221, "acc_stderr,none": 0.036429145782924055}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2774566473988439, "acc_stderr,none": 0.024105712607754307}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24134078212290502, "acc_stderr,none": 0.014310999547961447}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.26366559485530544, "acc_stderr,none": 0.025025538500532338}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02438366553103545}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.27509778357235987, "acc_stderr,none": 0.011405443620996932}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.28654970760233917, "acc_stderr,none": 0.03467826685703826}, "mmlu_other": {"acc,none": 0.3009333762471838, "acc_stderr,none": 0.008200055670980819, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.4, "acc_stderr,none": 
0.04923659639173309}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3018867924528302, "acc_stderr,none": 0.028254200344438655}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.30057803468208094, "acc_stderr,none": 0.03496101481191181}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.37, "acc_stderr,none": 0.048523658709390974}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.34977578475336324, "acc_stderr,none": 0.03200736719484503}, "mmlu_management": {"alias": " - management", "acc,none": 0.24271844660194175, "acc_stderr,none": 0.04245022486384495}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3547008547008547, "acc_stderr,none": 0.03134250486245402}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.3243933588761175, "acc_stderr,none": 0.016740929047162696}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.02526169121972948}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.026684564340461008}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1948529411764706, "acc_stderr,none": 0.024060599423487414}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2710843373493976, "acc_stderr,none": 0.03460579907553026}, "mmlu_social_sciences": {"acc,none": 0.2720181995450114, "acc_stderr,none": 0.008019978886083749, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281335}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2474747474747475, "acc_stderr,none": 0.0307463007421245}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.26424870466321243, "acc_stderr,none": 0.031821550509166484}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2717948717948718, "acc_stderr,none": 0.022556551010132368}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.226890756302521, "acc_stderr,none": 0.027205371538279483}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26972477064220185, "acc_stderr,none": 0.01902848671111545}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.03768335959728744}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.29901960784313725, "acc_stderr,none": 0.01852175621542302}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.04350271442923243}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.23265306122448978, "acc_stderr,none": 0.02704925791589618}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.32338308457711445, "acc_stderr,none": 0.03307615947979035}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_stem": {"acc,none": 0.25436092610212496, "acc_stderr,none": 0.007746447967555467, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_anatomy": 
{"alias": " - anatomy", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04072314811876837}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.035834961763610625}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2916666666666667, "acc_stderr,none": 0.03800968060554858}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.04220773659171452}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2723404255319149, "acc_stderr,none": 0.0291012906983867}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.21379310344827587, "acc_stderr,none": 0.03416520447747548}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.022860838309232072}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.27741935483870966, "acc_stderr,none": 0.02547019683590005}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1921182266009852, "acc_stderr,none": 0.02771931570961478}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.025928876132766097}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.25165562913907286, "acc_stderr,none": 0.03543304234389985}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.19907407407407407, "acc_stderr,none": 0.02723229846269024}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.1260804521276596, "exact_match_stderr,custom-extract": 0.003003378717388802, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.20920502092050208, "exact_match_stderr,custom-extract": 0.015200626647538911}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10899873257287707, "exact_match_stderr,custom-extract": 0.011101630697795408}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0715547703180212, "exact_match_stderr,custom-extract": 0.007664187803003869}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.15365853658536585, "exact_match_stderr,custom-extract": 0.01783156665820722}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1848341232227488, "exact_match_stderr,custom-extract": 0.013369041898186952}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09597523219814241, "exact_match_stderr,custom-extract": 0.009467429322574824}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 
0.1821515892420538, "exact_match_stderr,custom-extract": 0.01350336046749538}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12598425196850394, "exact_match_stderr,custom-extract": 0.01702260263856952}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09627611262488647, "exact_match_stderr,custom-extract": 0.008893665915732372}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.08660251665433012, "exact_match_stderr,custom-extract": 0.007654701811618188}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.13203463203463203, "exact_match_stderr,custom-extract": 0.011142798517705924}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.15831663326653306, "exact_match_stderr,custom-extract": 0.016357727678825804}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10007698229407236, "exact_match_stderr,custom-extract": 0.008329758962147563}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.17167919799498746, "exact_match_stderr,custom-extract": 0.013357616212456415}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.27, "acc_stderr,none": 0.01987435483128749, "acc_norm,none": 0.378, "acc_norm_stderr,none": 0.021706550824518184}, "piqa": {"alias": "piqa", "acc,none": 0.6936887921653971, "acc_stderr,none": 0.010754970032367318, "acc_norm,none": 0.70620239390642, "acc_norm_stderr,none": 0.010627574080514799}, "race": {"alias": "race", "acc,none": 0.3645933014354067, "acc_stderr,none": 0.014896354113839586}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43244626407369496, "acc_stderr,none": 0.011210331273967561}, "winogrande": {"alias": "winogrande", "acc,none": 0.6124704025256511, "acc_stderr,none": 0.01369235463601677}} {"created_at": "2025-04-28T05:31:41.477272", "global_step": 360000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3438566552901024, "acc_stderr,none": 0.013880644570156211, "acc_norm,none": 0.3967576791808874, "acc_norm_stderr,none": 0.014296513020180628}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6675084175084175, "acc_stderr,none": 0.009666892606130127, "acc_norm,none": 0.6485690235690236, "acc_norm_stderr,none": 0.00979639558281772}, "boolq": {"alias": "boolq", "acc,none": 0.7290519877675841, "acc_stderr,none": 0.007773467255881215}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.24488124488124488, "acc_stderr,none": 0.012311344255040359}, "copa": {"alias": "copa", "acc,none": 0.68, "acc_stderr,none": 0.04688261722621504}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4362676757618004, "acc_stderr,none": 0.004949080334816036, "acc_norm,none": 0.5804620593507269, "acc_norm_stderr,none": 0.004924748500639343}, "mmlu": {"acc,none": 0.26378008830650906, "acc_stderr,none": 0.003709119387579032, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2678002125398512, "acc_stderr,none": 0.006436422139490316, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.038522733649243156}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.03546563019624336}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3137254901960784, "acc_stderr,none": 0.03256685484460389}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.4092827004219409, 
"acc_stderr,none": 0.032007041833595914}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.0398497965330287}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.32407407407407407, "acc_stderr,none": 0.04524596007030048}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25766871165644173, "acc_stderr,none": 0.03436150827846917}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.28034682080924855, "acc_stderr,none": 0.02418242749657761}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2446927374301676, "acc_stderr,none": 0.014378169884098417}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.19614147909967847, "acc_stderr,none": 0.022552447780478022}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24382716049382716, "acc_stderr,none": 0.023891879541959607}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2666232073011734, "acc_stderr,none": 0.011293836031612135}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.27485380116959063, "acc_stderr,none": 0.03424042924691584}, "mmlu_other": {"acc,none": 0.28644995172191823, "acc_stderr,none": 0.008097203373421524, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.38, "acc_stderr,none": 0.048783173121456316}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3018867924528302, "acc_stderr,none": 0.02825420034443866}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.27167630057803466, "acc_stderr,none": 0.03391750322321659}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3632286995515695, "acc_stderr,none": 0.032277904428505}, "mmlu_management": {"alias": " - management", "acc,none": 0.24271844660194175, "acc_stderr,none": 0.04245022486384495}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.29914529914529914, "acc_stderr,none": 0.02999695185834948}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2771392081736909, "acc_stderr,none": 0.016005636294122428}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22875816993464052, "acc_stderr,none": 0.024051029739912255}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.026992199173064356}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.025767252010855966}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3253012048192771, "acc_stderr,none": 0.03647168523683227}, "mmlu_social_sciences": {"acc,none": 0.24764380890477738, "acc_stderr,none": 0.007784042500719237, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.0298575156733864}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.24352331606217617, "acc_stderr,none": 0.03097543638684542}, "mmlu_high_school_macroeconomics": {"alias": " - 
high_school_macroeconomics", "acc,none": 0.2230769230769231, "acc_stderr,none": 0.021107730127244}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23949579831932774, "acc_stderr,none": 0.02772206549336126}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24220183486238533, "acc_stderr,none": 0.01836817630659862}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2900763358778626, "acc_stderr,none": 0.03980066246467765}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2679738562091503, "acc_stderr,none": 0.017917974069594726}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04265792110940589}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.20408163265306123, "acc_stderr,none": 0.025801283475090506}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2835820895522388, "acc_stderr,none": 0.03187187537919797}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720683}, "mmlu_stem": {"acc,none": 0.25118934348239774, "acc_stderr,none": 0.0076969623752900646, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3111111111111111, "acc_stderr,none": 0.03999262876617722}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.20394736842105263, "acc_stderr,none": 0.03279000406310051}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2916666666666667, "acc_stderr,none": 0.03800968060554858}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653695}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768081}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2680851063829787, "acc_stderr,none": 0.028957342788342347}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.023068188848261124}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.24516129032258063, "acc_stderr,none": 0.024472243840895525}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.20689655172413793, "acc_stderr,none": 0.02850137816789395}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.02438843043398766}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2185430463576159, "acc_stderr,none": 0.03374235550425694}, "mmlu_high_school_statistics": {"alias": " - 
high_school_statistics", "acc,none": 0.17592592592592593, "acc_stderr,none": 0.025967420958258533}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.38392857142857145, "acc_stderr,none": 0.04616143075028547}, "mmlu_pro": {"exact_match,custom-extract": 0.14353390957446807, "exact_match_stderr,custom-extract": 0.0031710513446770398, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.2384937238493724, "exact_match_stderr,custom-extract": 0.01592643999671732}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11913814955640051, "exact_match_stderr,custom-extract": 0.011540276571470737}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07243816254416961, "exact_match_stderr,custom-extract": 0.007707683029020952}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.16341463414634147, "exact_match_stderr,custom-extract": 0.018282641806528298}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.19549763033175355, "exact_match_stderr,custom-extract": 0.013659054104691959}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10732714138286893, "exact_match_stderr,custom-extract": 0.009948629733788088}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.19926650366748166, "exact_match_stderr,custom-extract": 0.013974945415562682}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.13648293963254593, "exact_match_stderr,custom-extract": 0.017610952544682614}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11716621253405994, "exact_match_stderr,custom-extract": 0.009697154745523068}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.11621021465581051, "exact_match_stderr,custom-extract": 0.00872227462116445}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.14935064935064934, "exact_match_stderr,custom-extract": 0.011732160468660276}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13627254509018036, "exact_match_stderr,custom-extract": 0.015373681322287381}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.12779060816012316, "exact_match_stderr,custom-extract": 0.009266644485474497}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.21428571428571427, "exact_match_stderr,custom-extract": 0.014534489201025444}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.27, "acc_stderr,none": 0.01987435483128749, "acc_norm,none": 0.388, "acc_norm_stderr,none": 0.021814300984787635}, "piqa": {"alias": "piqa", "acc,none": 0.6936887921653971, "acc_stderr,none": 0.010754970032367321, "acc_norm,none": 0.6980413492927094, "acc_norm_stderr,none": 0.010711732891588346}, "race": {"alias": "race", "acc,none": 0.3617224880382775, "acc_stderr,none": 0.014871072026717747}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.42067553735926305, "acc_stderr,none": 0.011170778517705614}, "winogrande": {"alias": "winogrande", "acc,none": 0.6393054459352802, "acc_stderr,none": 0.013496064394234033}} {"created_at": "2025-04-28T07:08:18.738056", "global_step": 362000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.36006825938566556, "acc_stderr,none": 0.014027516814585186, "acc_norm,none": 0.4129692832764505, "acc_norm_stderr,none": 0.014388344935398326}, "arc_easy": {"alias": "arc_easy", 
"acc,none": 0.6898148148148148, "acc_stderr,none": 0.009491721291998515, "acc_norm,none": 0.6835016835016835, "acc_norm_stderr,none": 0.009543851857323891}, "boolq": {"alias": "boolq", "acc,none": 0.7382262996941896, "acc_stderr,none": 0.007688653730439844}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2285012285012285, "acc_stderr,none": 0.012020761312005539}, "copa": {"alias": "copa", "acc,none": 0.69, "acc_stderr,none": 0.04648231987117316}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.43845847440748853, "acc_stderr,none": 0.004951840978219681, "acc_norm,none": 0.5839474208325035, "acc_norm_stderr,none": 0.004918951019183904}, "mmlu": {"acc,none": 0.27182737501780374, "acc_stderr,none": 0.0037484587676164887, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2682252922422954, "acc_stderr,none": 0.006454725249176063, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.038095238095238106}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3151515151515151, "acc_stderr,none": 0.0362773057502241}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.031980016601150726}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.270042194092827, "acc_stderr,none": 0.028900721906293426}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.23140495867768596, "acc_stderr,none": 0.03849856098794088}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3312883435582822, "acc_stderr,none": 0.03697983910025588}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.25722543352601157, "acc_stderr,none": 0.023532925431044287}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24804469273743016, "acc_stderr,none": 0.01444415780826146}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2958199356913183, "acc_stderr,none": 0.025922371788818777}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.31790123456790126, "acc_stderr,none": 0.025910063528240865}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2653194263363755, "acc_stderr,none": 0.011276198843958873}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.19883040935672514, "acc_stderr,none": 0.030611116557432528}, "mmlu_other": {"acc,none": 0.26295461860315417, "acc_stderr,none": 0.007886749202197592, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22264150943396227, "acc_stderr,none": 0.025604233470899105}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.28901734104046245, "acc_stderr,none": 0.03456425745087001}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.21076233183856502, "acc_stderr,none": 0.027373095500540193}, "mmlu_management": {"alias": " - management", "acc,none": 0.27184466019417475, "acc_stderr,none": 0.044052680241409216}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.25213675213675213, "acc_stderr,none": 
0.02844796547623101}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165065}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.28607918263090676, "acc_stderr,none": 0.01616087140512753}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.30718954248366015, "acc_stderr,none": 0.026415601914389}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.026992199173064356}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.21323529411764705, "acc_stderr,none": 0.024880971512294285}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.24096385542168675, "acc_stderr,none": 0.03329394119073531}, "mmlu_social_sciences": {"acc,none": 0.2853428664283393, "acc_stderr,none": 0.008110999532419168, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.19298245614035087, "acc_stderr,none": 0.03712454853721368}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.29292929292929293, "acc_stderr,none": 0.03242497958178815}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2849740932642487, "acc_stderr,none": 0.03257714077709659}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2846153846153846, "acc_stderr,none": 0.022878322799706297}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2815126050420168, "acc_stderr,none": 0.02921354941437216}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.27706422018348625, "acc_stderr,none": 0.01918848259016953}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306086}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.27124183006535946, "acc_stderr,none": 0.01798661530403031}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.19090909090909092, "acc_stderr,none": 0.03764425585984926}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.4, "acc_stderr,none": 0.03136250240935893}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.36318407960199006, "acc_stderr,none": 0.034005985055990146}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_stem": {"acc,none": 0.272756105296543, "acc_stderr,none": 0.007928359646207219, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3037037037037037, "acc_stderr,none": 0.03972552884785136}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3092105263157895, "acc_stderr,none": 0.037610708698674805}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.037455547914624555}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": 
" - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.19148936170212766, "acc_stderr,none": 0.025722149992637805}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3103448275862069, "acc_stderr,none": 0.03855289616378947}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.022644212615525218}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3161290322580645, "acc_stderr,none": 0.02645087448904277}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.26108374384236455, "acc_stderr,none": 0.030903796952114485}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2847682119205298, "acc_stderr,none": 0.03684881521389023}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02988691054762696}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755805}, "mmlu_pro": {"exact_match,custom-extract": 0.13771609042553193, "exact_match_stderr,custom-extract": 0.0031277699073629698, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.2217573221757322, "exact_match_stderr,custom-extract": 0.015525299781071858}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10139416983523447, "exact_match_stderr,custom-extract": 0.010752959229023352}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.09628975265017668, "exact_match_stderr,custom-extract": 0.008771489271271223}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.14878048780487804, "exact_match_stderr,custom-extract": 0.017596736073033845}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.17772511848341233, "exact_match_stderr,custom-extract": 0.013166463235967815}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10319917440660474, "exact_match_stderr,custom-extract": 0.009777963967387046}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.16136919315403422, "exact_match_stderr,custom-extract": 0.01287018209245067}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.13123359580052493, "exact_match_stderr,custom-extract": 0.01732136945584154}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11989100817438691, "exact_match_stderr,custom-extract": 0.009794114853194124}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.12213175425610659, "exact_match_stderr,custom-extract": 0.008911731297001183}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.13636363636363635, "exact_match_stderr,custom-extract": 0.011295719428226613}, "mmlu_pro_philosophy": {"alias": " - philosophy", 
"exact_match,custom-extract": 0.15030060120240482, "exact_match_stderr,custom-extract": 0.01601394538357726}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.12779060816012316, "exact_match_stderr,custom-extract": 0.009266644485474505}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.19047619047619047, "exact_match_stderr,custom-extract": 0.01390932327432391}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.266, "acc_stderr,none": 0.019780559675655486, "acc_norm,none": 0.376, "acc_norm_stderr,none": 0.02168382753928612}, "piqa": {"alias": "piqa", "acc,none": 0.7007616974972797, "acc_stderr,none": 0.010684130673134581, "acc_norm,none": 0.7040261153427638, "acc_norm_stderr,none": 0.010650414317148126}, "race": {"alias": "race", "acc,none": 0.3588516746411483, "acc_stderr,none": 0.014845215125262313}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43807574206755373, "acc_stderr,none": 0.01122696506802993}, "winogrande": {"alias": "winogrande", "acc,none": 0.6235201262825573, "acc_stderr,none": 0.013616931960667185}} {"created_at": "2025-04-28T09:09:01.054124", "global_step": 364000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.36177474402730375, "acc_stderr,none": 0.014041957945038071, "acc_norm,none": 0.40784982935153585, "acc_norm_stderr,none": 0.014361097288449696}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6763468013468014, "acc_stderr,none": 0.00960047818227378, "acc_norm,none": 0.6544612794612794, "acc_norm_stderr,none": 0.0097579487306703}, "boolq": {"alias": "boolq", "acc,none": 0.5495412844036697, "acc_stderr,none": 0.008702022442950883}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19901719901719903, "acc_stderr,none": 0.011430809442838375}, "copa": {"alias": "copa", "acc,none": 0.75, "acc_stderr,none": 0.04351941398892446}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4355706034654451, "acc_stderr,none": 0.004948181367024938, "acc_norm,none": 0.5819557857000598, "acc_norm_stderr,none": 0.004922294797766665}, "mmlu": {"acc,none": 0.2591511180743484, "acc_stderr,none": 0.003692299952817129, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27927736450584484, "acc_stderr,none": 0.006533988951587552, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.033333333333333354}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.296969696969697, "acc_stderr,none": 0.03567969772268049}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27450980392156865, "acc_stderr,none": 0.03132179803083292}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.29957805907172996, "acc_stderr,none": 0.029818024749753102}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.371900826446281, "acc_stderr,none": 0.04412015806624504}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.04077494709252628}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3067484662576687, "acc_stderr,none": 0.036230899157241474}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2947976878612717, "acc_stderr,none": 0.02454761779480383}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", 
"acc,none": 0.3022508038585209, "acc_stderr,none": 0.026082700695399655}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2993827160493827, "acc_stderr,none": 0.02548311560119547}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2816166883963494, "acc_stderr,none": 0.011487783272786696}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.03508771929824563}, "mmlu_other": {"acc,none": 0.24299967814612167, "acc_stderr,none": 0.007685020653974862, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22641509433962265, "acc_stderr,none": 0.02575755989310674}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.03295304696818318}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.20179372197309417, "acc_stderr,none": 0.02693611191280227}, "mmlu_management": {"alias": " - management", "acc,none": 0.1941747572815534, "acc_stderr,none": 0.03916667762822585}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.26495726495726496, "acc_stderr,none": 0.028911208802749486}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2707535121328225, "acc_stderr,none": 0.01588988836256049}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.0248480182638752}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2695035460992908, "acc_stderr,none": 0.02646903681859063}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.18382352941176472, "acc_stderr,none": 0.023529242185193106}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.20481927710843373, "acc_stderr,none": 0.031417842916639245}, "mmlu_social_sciences": {"acc,none": 0.23756906077348067, "acc_stderr,none": 0.007676764149375163, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.20175438596491227, "acc_stderr,none": 0.03775205013583638}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25252525252525254, "acc_stderr,none": 0.030954055470365907}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.22797927461139897, "acc_stderr,none": 0.03027690994517826}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2205128205128205, "acc_stderr,none": 0.02102067268082791}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.22268907563025211, "acc_stderr,none": 0.02702543349888238}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23119266055045873, "acc_stderr,none": 0.018075750241633156}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2761437908496732, "acc_stderr,none": 0.018087276935663137}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 
0.20909090909090908, "acc_stderr,none": 0.038950910157241364}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2163265306122449, "acc_stderr,none": 0.02635891633490402}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916718}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_stem": {"acc,none": 0.2660957817951158, "acc_stderr,none": 0.00786470145985221, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.32592592592592595, "acc_stderr,none": 0.040491220417025055}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3092105263157895, "acc_stderr,none": 0.037610708698674805}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2, "acc_stderr,none": 0.0261488180184245}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.03806142687309993}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776564}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25806451612903225, "acc_stderr,none": 0.024892469172462836}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.031947400722655395}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02671924078371216}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2847682119205298, "acc_stderr,none": 0.03684881521389023}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.20833333333333334, "acc_stderr,none": 0.02769691071309394}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.24107142857142858, "acc_stderr,none": 0.04059867246952687}, "mmlu_pro": {"exact_match,custom-extract": 0.13472406914893617, "exact_match_stderr,custom-extract": 0.003094262631605881, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.20641562064156208, "exact_match_stderr,custom-extract": 0.015125555172262694}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.09759188846641319, "exact_match_stderr,custom-extract": 0.010571710152486613}, "mmlu_pro_chemistry": {"alias": " - chemistry", 
"exact_match,custom-extract": 0.06978798586572438, "exact_match_stderr,custom-extract": 0.007576175072607249}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.14146341463414633, "exact_match_stderr,custom-extract": 0.01723216394465976}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.17061611374407584, "exact_match_stderr,custom-extract": 0.012956092265003962}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.11661506707946337, "exact_match_stderr,custom-extract": 0.01031607874089496}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.17603911980440098, "exact_match_stderr,custom-extract": 0.013324375473773215}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.15223097112860892, "exact_match_stderr,custom-extract": 0.01842886055804924}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.12806539509536785, "exact_match_stderr,custom-extract": 0.010075381773702277}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.10066617320503331, "exact_match_stderr,custom-extract": 0.008189084640082613}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.13095238095238096, "exact_match_stderr,custom-extract": 0.011103953542030812}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.14829659318637275, "exact_match_stderr,custom-extract": 0.0159255744939775}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.12702078521939955, "exact_match_stderr,custom-extract": 0.00924276693598881}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.20426065162907267, "exact_match_stderr,custom-extract": 0.01428067096264084}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.26, "acc_stderr,none": 0.019635965529725512, "acc_norm,none": 0.374, "acc_norm_stderr,none": 0.021660710347204484}, "piqa": {"alias": "piqa", "acc,none": 0.6849836779107725, "acc_stderr,none": 0.01083807274624065, "acc_norm,none": 0.6953210010881393, "acc_norm_stderr,none": 0.010738889044325161}, "race": {"alias": "race", "acc,none": 0.36650717703349284, "acc_stderr,none": 0.014912890943719231}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4247697031729785, "acc_stderr,none": 0.011185271257671336}, "winogrande": {"alias": "winogrande", "acc,none": 0.6195737963693765, "acc_stderr,none": 0.013644727908656831}} {"created_at": "2025-04-28T11:04:11.467245", "global_step": 366000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3395904436860068, "acc_stderr,none": 0.013839039762820167, "acc_norm,none": 0.3916382252559727, "acc_norm_stderr,none": 0.01426412212493822}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6611952861952862, "acc_stderr,none": 0.009711980224301631, "acc_norm,none": 0.6346801346801347, "acc_norm_stderr,none": 0.009880576614806924}, "boolq": {"alias": "boolq", "acc,none": 0.7048929663608563, "acc_stderr,none": 0.00797707928526976}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19574119574119575, "acc_stderr,none": 0.011359497363584395}, "copa": {"alias": "copa", "acc,none": 0.74, "acc_stderr,none": 0.0440844002276808}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.43487353116908983, "acc_stderr,none": 0.004947272454226198, "acc_norm,none": 0.5785700059749054, "acc_norm_stderr,none": 0.004927790036726622}, "mmlu": {"acc,none": 0.2649195271328871, "acc_stderr,none": 0.0037203596269857545, 
"alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2760892667375133, "acc_stderr,none": 0.006510322242272694, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.033333333333333354}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.03546563019624336}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3284313725490196, "acc_stderr,none": 0.03296245110172229}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3037974683544304, "acc_stderr,none": 0.029936696387138605}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.04391326286724071}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3067484662576687, "acc_stderr,none": 0.036230899157241474}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.29190751445086704, "acc_stderr,none": 0.024476994076247333}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2958199356913183, "acc_stderr,none": 0.025922371788818777}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2993827160493827, "acc_stderr,none": 0.02548311560119547}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26792698826597133, "acc_stderr,none": 0.011311347690633869}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.03446296217088427}, "mmlu_other": {"acc,none": 0.2481493401995494, "acc_stderr,none": 0.007739860107405158, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2339622641509434, "acc_stderr,none": 0.02605529690115292}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2832369942196532, "acc_stderr,none": 0.03435568056047874}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2062780269058296, "acc_stderr,none": 0.02715715047956382}, "mmlu_management": {"alias": " - management", "acc,none": 0.22330097087378642, "acc_stderr,none": 0.04123553189891431}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2264957264957265, "acc_stderr,none": 0.02742100729539292}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2720306513409962, "acc_stderr,none": 0.015913367447500517}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2679738562091503, "acc_stderr,none": 0.025360603796242553}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2730496453900709, "acc_stderr,none": 0.026577860943307854}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20955882352941177, "acc_stderr,none": 0.02472311040767705}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.1746987951807229, "acc_stderr,none": 
0.029560326211256847}, "mmlu_social_sciences": {"acc,none": 0.2560935976600585, "acc_stderr,none": 0.007875694449031584, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518754}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.25757575757575757, "acc_stderr,none": 0.03115626951964684}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.25906735751295334, "acc_stderr,none": 0.03161877917935411}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2564102564102564, "acc_stderr,none": 0.02213908110397153}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24369747899159663, "acc_stderr,none": 0.02788682807838056}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23486238532110093, "acc_stderr,none": 0.018175110510343578}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2761437908496732, "acc_stderr,none": 0.018087276935663137}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.04122066502878284}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2979591836734694, "acc_stderr,none": 0.02927956741106567}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.2537313432835821, "acc_stderr,none": 0.03076944496729602}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_stem": {"acc,none": 0.27339042182048845, "acc_stderr,none": 0.007931019384985402, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.32592592592592595, "acc_stderr,none": 0.040491220417025055}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3223684210526316, "acc_stderr,none": 0.03803510248351586}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2847222222222222, "acc_stderr,none": 0.03773809990686935}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.37, "acc_stderr,none": 0.048523658709391}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.04220773659171452}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.20425531914893616, "acc_stderr,none": 0.02635515841334943}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2827586206896552, "acc_stderr,none": 0.03752833958003336}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776564}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.27419354838709675, 
"acc_stderr,none": 0.025378139970885196}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.30049261083743845, "acc_stderr,none": 0.03225799476233485}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2781456953642384, "acc_stderr,none": 0.03658603262763743}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.028353212866863438}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755806}, "mmlu_pro": {"exact_match,custom-extract": 0.1432845744680851, "exact_match_stderr,custom-extract": 0.0031775330528792793, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.19804741980474197, "exact_match_stderr,custom-extract": 0.014893694032916231}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11913814955640051, "exact_match_stderr,custom-extract": 0.011540276571470732}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.09717314487632508, "exact_match_stderr,custom-extract": 0.008807325782377471}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.15609756097560976, "exact_match_stderr,custom-extract": 0.01794661415125632}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.20497630331753555, "exact_match_stderr,custom-extract": 0.013903626023537346}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.11661506707946337, "exact_match_stderr,custom-extract": 0.01031607874089497}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.19193154034229828, "exact_match_stderr,custom-extract": 0.01377800138301344}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.14173228346456693, "exact_match_stderr,custom-extract": 0.01789179783326822}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.12170753860127158, "exact_match_stderr,custom-extract": 0.009857844760249224}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.10436713545521836, "exact_match_stderr,custom-extract": 0.008321085955307983}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.15476190476190477, "exact_match_stderr,custom-extract": 0.011904761904761823}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1462925851703407, "exact_match_stderr,custom-extract": 0.01583620126390544}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.12548113933795227, "exact_match_stderr,custom-extract": 0.009194676853537979}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.20426065162907267, "exact_match_stderr,custom-extract": 0.014280670962640848}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.254, "acc_stderr,none": 0.019486596801643375, "acc_norm,none": 0.36, "acc_norm_stderr,none": 0.02148775108972052}, "piqa": {"alias": "piqa", "acc,none": 0.6860718171926007, "acc_stderr,none": 0.010827928134189646, "acc_norm,none": 0.6926006528835691, "acc_norm_stderr,none": 
0.010765602506939063}, "race": {"alias": "race", "acc,none": 0.3598086124401914, "acc_stderr,none": 0.014853898144597874}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.40736949846468784, "acc_stderr,none": 0.011118216651888712}, "winogrande": {"alias": "winogrande", "acc,none": 0.5990528808208366, "acc_stderr,none": 0.013773974554948028}} {"created_at": "2025-04-28T15:17:13.579208", "global_step": 370000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3532423208191126, "acc_stderr,none": 0.013967822714840055, "acc_norm,none": 0.40017064846416384, "acc_norm_stderr,none": 0.014317197787809181}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6616161616161617, "acc_stderr,none": 0.009709034670525096, "acc_norm,none": 0.6418350168350169, "acc_norm_stderr,none": 0.009838331651451848}, "boolq": {"alias": "boolq", "acc,none": 0.7064220183486238, "acc_stderr,none": 0.00796501124942007}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2334152334152334, "acc_stderr,none": 0.012110575321206395}, "copa": {"alias": "copa", "acc,none": 0.76, "acc_stderr,none": 0.04292346959909284}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.44284007169886475, "acc_stderr,none": 0.004957068377516515, "acc_norm,none": 0.5855407289384584, "acc_norm_stderr,none": 0.004916216503770338}, "mmlu": {"acc,none": 0.2870673693206096, "acc_stderr,none": 0.0038078941692647884, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2609989373007439, "acc_stderr,none": 0.00640468137715756, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.0404061017820884}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.296969696969697, "acc_stderr,none": 0.035679697722680474}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.03198001660115072}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.31223628691983124, "acc_stderr,none": 0.030165137867847015}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2066115702479339, "acc_stderr,none": 0.03695980128098824}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052192}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25153374233128833, "acc_stderr,none": 0.034089978868575295}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2774566473988439, "acc_stderr,none": 0.024105712607754307}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24804469273743016, "acc_stderr,none": 0.014444157808261441}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.27009646302250806, "acc_stderr,none": 0.025218040373410622}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25, "acc_stderr,none": 0.02409347123262133}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2522816166883963, "acc_stderr,none": 0.011092789056875241}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2573099415204678, "acc_stderr,none": 0.03352799844161865}, "mmlu_other": {"acc,none": 0.30157708400386224, "acc_stderr,none": 0.008228405499953175, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 
0.36981132075471695, "acc_stderr,none": 0.029711421880107915}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3583815028901734, "acc_stderr,none": 0.03656343653353159}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.29596412556053814, "acc_stderr,none": 0.030636591348699803}, "mmlu_management": {"alias": " - management", "acc,none": 0.3106796116504854, "acc_stderr,none": 0.045821241601615506}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.02934311479809447}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.28991060025542786, "acc_stderr,none": 0.01622501794477097}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2908496732026144, "acc_stderr,none": 0.026004800363952113}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2730496453900709, "acc_stderr,none": 0.026577860943307854}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3088235294117647, "acc_stderr,none": 0.02806499816704009}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25903614457831325, "acc_stderr,none": 0.03410646614071856}, "mmlu_social_sciences": {"acc,none": 0.3214169645758856, "acc_stderr,none": 0.00838463203153491, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.04227054451232199}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3181818181818182, "acc_stderr,none": 0.03318477333845332}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.30569948186528495, "acc_stderr,none": 0.03324837939758159}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.37435897435897436, "acc_stderr,none": 0.024537591572830513}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3277310924369748, "acc_stderr,none": 0.03048991141767323}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.326605504587156, "acc_stderr,none": 0.020106990889937303}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2581699346405229, "acc_stderr,none": 0.017704531653250068}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3090909090909091, "acc_stderr,none": 0.044262946482000985}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.4448979591836735, "acc_stderr,none": 0.031814251181977865}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.03333333333333337}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_stem": {"acc,none": 0.2781477957500793, "acc_stderr,none": 0.007972309199952145, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.035914440841969694}, 
"mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2565789473684211, "acc_stderr,none": 0.035541803680256896}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2916666666666667, "acc_stderr,none": 0.038009680605548574}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110196}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720683}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237654}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2936170212765957, "acc_stderr,none": 0.029771642712491227}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.023517294335963286}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.29354838709677417, "acc_stderr,none": 0.025906087021319295}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2561576354679803, "acc_stderr,none": 0.030712730070982592}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.02488211685765507}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2582781456953642, "acc_stderr,none": 0.035737053147634576}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3611111111111111, "acc_stderr,none": 0.032757734861009996}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.042878587513404544}, "mmlu_pro": {"exact_match,custom-extract": 0.13813164893617022, "exact_match_stderr,custom-extract": 0.0031246738058714574, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.22733612273361228, "exact_match_stderr,custom-extract": 0.015662930387246324}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11533586818757921, "exact_match_stderr,custom-extract": 0.01137910999531723}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0706713780918728, "exact_match_stderr,custom-extract": 0.007620353777747201}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.13658536585365855, "exact_match_stderr,custom-extract": 0.01698048669306052}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1872037914691943, "exact_match_stderr,custom-extract": 0.013434897809670464}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.11145510835913312, "exact_match_stderr,custom-extract": 0.010114676513406421}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.18948655256723718, "exact_match_stderr,custom-extract": 0.013710657477823645}, 
"mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12598425196850394, "exact_match_stderr,custom-extract": 0.01702260263856952}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.148047229791099, "exact_match_stderr,custom-extract": 0.010708078833813523}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.10436713545521836, "exact_match_stderr,custom-extract": 0.008321085955307983}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1341991341991342, "exact_match_stderr,custom-extract": 0.011219745719880863}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1462925851703407, "exact_match_stderr,custom-extract": 0.01583620126390544}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.11624326404926867, "exact_match_stderr,custom-extract": 0.0088963717094982}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.18922305764411027, "exact_match_stderr,custom-extract": 0.013874219527708332}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.286, "acc_stderr,none": 0.020229346329177524, "acc_norm,none": 0.356, "acc_norm_stderr,none": 0.021434712356072635}, "piqa": {"alias": "piqa", "acc,none": 0.6947769314472253, "acc_stderr,none": 0.01074426704560648, "acc_norm,none": 0.7013057671381937, "acc_norm_stderr,none": 0.01067855639814923}, "race": {"alias": "race", "acc,none": 0.369377990430622, "acc_stderr,none": 0.014937221457864277}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.42067553735926305, "acc_stderr,none": 0.011170778517705605}, "winogrande": {"alias": "winogrande", "acc,none": 0.6235201262825573, "acc_stderr,none": 0.01361693196066718}} {"created_at": "2025-04-28T15:37:38.494458", "global_step": 368000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3506825938566553, "acc_stderr,none": 0.013944635930726089, "acc_norm,none": 0.39505119453924914, "acc_norm_stderr,none": 0.01428589829293817}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6851851851851852, "acc_stderr,none": 0.009530150430975612, "acc_norm,none": 0.6721380471380471, "acc_norm_stderr,none": 0.009632587076170011}, "boolq": {"alias": "boolq", "acc,none": 0.654434250764526, "acc_stderr,none": 0.008317463342191586}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20556920556920558, "acc_stderr,none": 0.011569834551534309}, "copa": {"alias": "copa", "acc,none": 0.74, "acc_stderr,none": 0.044084400227680794}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.43467436765584544, "acc_stderr,none": 0.004947010937455325, "acc_norm,none": 0.5849432383987253, "acc_norm_stderr,none": 0.004917248150601854}, "mmlu": {"acc,none": 0.26321036889332006, "acc_stderr,none": 0.003709771583142903, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.26950053134962804, "acc_stderr,none": 0.006464034270060312, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.03893259610604674}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.03427743175816524}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.29901960784313725, "acc_stderr,none": 0.03213325717373617}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3037974683544304, "acc_stderr,none": 0.0299366963871386}, "mmlu_international_law": {"alias": " - 
international_law", "acc,none": 0.32231404958677684, "acc_stderr,none": 0.04266416363352168}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.32407407407407407, "acc_stderr,none": 0.04524596007030048}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22699386503067484, "acc_stderr,none": 0.032910995786157686}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2658959537572254, "acc_stderr,none": 0.023786203255508277}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2446927374301676, "acc_stderr,none": 0.014378169884098424}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.22186495176848875, "acc_stderr,none": 0.023598858292863047}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.02378858355165854}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2835723598435463, "acc_stderr,none": 0.011511900775968318}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.32748538011695905, "acc_stderr,none": 0.035993357714560276}, "mmlu_other": {"acc,none": 0.2832314129385259, "acc_stderr,none": 0.008068784807058534, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.34, "acc_stderr,none": 0.047609522856952365}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23773584905660378, "acc_stderr,none": 0.02619980880756192}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2658959537572254, "acc_stderr,none": 0.033687629322594295}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.35874439461883406, "acc_stderr,none": 0.03219079200419996}, "mmlu_management": {"alias": " - management", "acc,none": 0.2815533980582524, "acc_stderr,none": 0.04453254836326467}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.31196581196581197, "acc_stderr,none": 0.030351527323344948}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2848020434227331, "acc_stderr,none": 0.016139174096522553}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2875816993464052, "acc_stderr,none": 0.02591780611714716}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.32978723404255317, "acc_stderr,none": 0.028045946942042398}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.21323529411764705, "acc_stderr,none": 0.024880971512294254}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25301204819277107, "acc_stderr,none": 0.03384429155233133}, "mmlu_social_sciences": {"acc,none": 0.25999350016249595, "acc_stderr,none": 0.007900829731074739, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.038351539543994194}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.029126522834586815}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.24352331606217617, "acc_stderr,none": 0.03097543638684542}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.23333333333333334, "acc_stderr,none": 
0.021444547301560486}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.026653531596715477}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26238532110091745, "acc_stderr,none": 0.018861885021534738}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3053435114503817, "acc_stderr,none": 0.04039314978724561}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.272875816993464, "acc_stderr,none": 0.018020474148393577}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3, "acc_stderr,none": 0.04389311454644287}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2653061224489796, "acc_stderr,none": 0.028263889943784593}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.31840796019900497, "acc_stderr,none": 0.03294118479054095}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_stem": {"acc,none": 0.23723437995559785, "acc_stderr,none": 0.007565860674621111, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.037498507091740206}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.19078947368421054, "acc_stderr,none": 0.031975658210325}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2847222222222222, "acc_stderr,none": 0.03773809990686935}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.19, "acc_stderr,none": 0.03942772444036623}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621503}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28085106382978725, "acc_stderr,none": 0.029379170464124818}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.25517241379310346, "acc_stderr,none": 0.03632984052707842}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21957671957671956, "acc_stderr,none": 0.021320018599770344}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.23225806451612904, "acc_stderr,none": 0.02402225613030824}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.19704433497536947, "acc_stderr,none": 0.02798672466673622}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.1925925925925926, "acc_stderr,none": 0.024043075181945192}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2582781456953642, "acc_stderr,none": 0.035737053147634576}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.19907407407407407, "acc_stderr,none": 
0.02723229846269024}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.044328040552915185}, "mmlu_pro": {"exact_match,custom-extract": 0.13971077127659576, "exact_match_stderr,custom-extract": 0.0031439077903561824, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.20781032078103207, "exact_match_stderr,custom-extract": 0.015163226952637904}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.1229404309252218, "exact_match_stderr,custom-extract": 0.01169765483509599}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.08215547703180212, "exact_match_stderr,custom-extract": 0.008165288212152419}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.15365853658536585, "exact_match_stderr,custom-extract": 0.01783156665820722}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.19075829383886256, "exact_match_stderr,custom-extract": 0.013532157875306493}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.12693498452012383, "exact_match_stderr,custom-extract": 0.010699815313971536}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1821515892420538, "exact_match_stderr,custom-extract": 0.013503360467495373}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.13123359580052493, "exact_match_stderr,custom-extract": 0.01732136945584154}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1444141689373297, "exact_match_stderr,custom-extract": 0.010598401112152002}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.09622501850481126, "exact_match_stderr,custom-extract": 0.008026150053444925}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.12337662337662338, "exact_match_stderr,custom-extract": 0.010824855641262714}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.15030060120240482, "exact_match_stderr,custom-extract": 0.01601394538357726}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.12471131639722864, "exact_match_stderr,custom-extract": 0.009170462563913077}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.19548872180451127, "exact_match_stderr,custom-extract": 0.01404745861054571}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.266, "acc_stderr,none": 0.019780559675655486, "acc_norm,none": 0.368, "acc_norm_stderr,none": 0.021588982568353544}, "piqa": {"alias": "piqa", "acc,none": 0.6893362350380848, "acc_stderr,none": 0.010797078933727673, "acc_norm,none": 0.6996735582154516, "acc_norm_stderr,none": 0.010695225308183138}, "race": {"alias": "race", "acc,none": 0.35406698564593303, "acc_stderr,none": 0.014800834711677317}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.41914022517911975, "acc_stderr,none": 0.01116514070817033}, "winogrande": {"alias": "winogrande", "acc,none": 0.6156274664561957, "acc_stderr,none": 0.01367156760083619}} {"created_at": "2025-04-28T17:00:30.231907", "global_step": 372000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3455631399317406, "acc_stderr,none": 0.013896938461145682, "acc_norm,none": 0.4052901023890785, "acc_norm_stderr,none": 0.014346869060229321}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6590909090909091, "acc_stderr,none": 0.009726579593424019, 
"acc_norm,none": 0.6359427609427609, "acc_norm_stderr,none": 0.00987329339277912}, "boolq": {"alias": "boolq", "acc,none": 0.6859327217125383, "acc_stderr,none": 0.008117917728841498}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2375102375102375, "acc_stderr,none": 0.012183673723473462}, "copa": {"alias": "copa", "acc,none": 0.76, "acc_stderr,none": 0.04292346959909283}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4387572196773551, "acc_stderr,none": 0.004952209831856592, "acc_norm,none": 0.583150766779526, "acc_norm_stderr,none": 0.004920298437884911}, "mmlu": {"acc,none": 0.27111522575131747, "acc_stderr,none": 0.003741442047081501, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2682252922422954, "acc_stderr,none": 0.0064455389855564565, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.03970158273235173}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2787878787878788, "acc_stderr,none": 0.03501438706296781}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.29901960784313725, "acc_stderr,none": 0.03213325717373616}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.37130801687763715, "acc_stderr,none": 0.03145068600744859}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04065578140908705}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.04414343666854932}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.03259177392742178}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3063583815028902, "acc_stderr,none": 0.024818350129436596}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23687150837988827, "acc_stderr,none": 0.014219570788103986}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.24115755627009647, "acc_stderr,none": 0.024296594034763426}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.024383665531035454}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25684485006518903, "acc_stderr,none": 0.011158455853098846}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3567251461988304, "acc_stderr,none": 0.03674013002860954}, "mmlu_other": {"acc,none": 0.29417444480205984, "acc_stderr,none": 0.00816686899636443, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.29056603773584905, "acc_stderr,none": 0.027943219989337152}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2832369942196532, "acc_stderr,none": 0.034355680560478746}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3632286995515695, "acc_stderr,none": 0.032277904428505}, "mmlu_management": {"alias": " - management", "acc,none": 0.2815533980582524, "acc_stderr,none": 0.044532548363264673}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.32051282051282054, "acc_stderr,none": 0.030572811310299607}, "mmlu_medical_genetics": {"alias": " - medical_genetics", 
"acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.3052362707535121, "acc_stderr,none": 0.01646771194763511}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.30392156862745096, "acc_stderr,none": 0.02633661346904663}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2624113475177305, "acc_stderr,none": 0.026244920349843}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.22426470588235295, "acc_stderr,none": 0.025336848563332383}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3072289156626506, "acc_stderr,none": 0.035915667978246614}, "mmlu_social_sciences": {"acc,none": 0.2765680857978551, "acc_stderr,none": 0.008062328687974657, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.040969851398436716}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.21717171717171718, "acc_stderr,none": 0.029376616484945633}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.23834196891191708, "acc_stderr,none": 0.03074890536390991}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2512820512820513, "acc_stderr,none": 0.021992016662370578}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2605042016806723, "acc_stderr,none": 0.02851025151234193}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.28256880733944956, "acc_stderr,none": 0.01930424349770715}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2900763358778626, "acc_stderr,none": 0.03980066246467765}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2973856209150327, "acc_stderr,none": 0.018492596536396958}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3090909090909091, "acc_stderr,none": 0.044262946482000985}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.27755102040816326, "acc_stderr,none": 0.02866685779027465}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.3034825870646766, "acc_stderr,none": 0.032510068164586174}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_stem": {"acc,none": 0.24738344433872503, "acc_stderr,none": 0.007645633874797977, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.28888888888888886, "acc_stderr,none": 0.039154506304142495}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.18421052631578946, "acc_stderr,none": 0.0315469804508223}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036843}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110196}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.27450980392156865, 
"acc_stderr,none": 0.04440521906179325}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2936170212765957, "acc_stderr,none": 0.029771642712491227}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.036951833116502325}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2328042328042328, "acc_stderr,none": 0.021765961672154523}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.267741935483871, "acc_stderr,none": 0.025189006660212374}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1625615763546798, "acc_stderr,none": 0.025960300064605576}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.1814814814814815, "acc_stderr,none": 0.023499264669407317}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2251655629139073, "acc_stderr,none": 0.03410435282008936}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18055555555555555, "acc_stderr,none": 0.02623287897149166}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.36607142857142855, "acc_stderr,none": 0.0457237235873743}, "mmlu_pro": {"exact_match,custom-extract": 0.13597074468085107, "exact_match_stderr,custom-extract": 0.0031043865134485166, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.22733612273361228, "exact_match_stderr,custom-extract": 0.01566293038724634}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.11280101394169835, "exact_match_stderr,custom-extract": 0.011269480888070114}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.08745583038869258, "exact_match_stderr,custom-extract": 0.008400207784455766}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.11463414634146342, "exact_match_stderr,custom-extract": 0.01575276269742974}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.17298578199052134, "exact_match_stderr,custom-extract": 0.013027104749292898}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10939112487100103, "exact_match_stderr,custom-extract": 0.010032216012957403}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1784841075794621, "exact_match_stderr,custom-extract": 0.013396666080859064}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.16272965879265092, "exact_match_stderr,custom-extract": 0.01893539688282779}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.09900090826521345, "exact_match_stderr,custom-extract": 0.009005035380672342}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.10066617320503331, "exact_match_stderr,custom-extract": 0.008189084640082615}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.14393939393939395, "exact_match_stderr,custom-extract": 0.011554236174546458}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1282565130260521, 
"exact_match_stderr,custom-extract": 0.01498371136300156}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.13240954580446498, "exact_match_stderr,custom-extract": 0.00940761821181917}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.20551378446115287, "exact_match_stderr,custom-extract": 0.014313126213532989}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.256, "acc_stderr,none": 0.019536923574747605, "acc_norm,none": 0.364, "acc_norm_stderr,none": 0.02153917063731769}, "piqa": {"alias": "piqa", "acc,none": 0.6942328618063112, "acc_stderr,none": 0.010749627366141642, "acc_norm,none": 0.6958650707290533, "acc_norm_stderr,none": 0.010733493335721314}, "race": {"alias": "race", "acc,none": 0.3684210526315789, "acc_stderr,none": 0.014929174445557296}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4263050153531218, "acc_stderr,none": 0.011190503463264756}, "winogrande": {"alias": "winogrande", "acc,none": 0.6274664561957379, "acc_stderr,none": 0.013588173888522457}} {"created_at": "2025-04-28T18:00:14.679977", "global_step": 374000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.35665529010238906, "acc_stderr,none": 0.013998056902620199, "acc_norm,none": 0.39590443686006827, "acc_norm_stderr,none": 0.014291228393536588}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6763468013468014, "acc_stderr,none": 0.009600478182273761, "acc_norm,none": 0.6565656565656566, "acc_norm_stderr,none": 0.009743817368960007}, "boolq": {"alias": "boolq", "acc,none": 0.7051987767584098, "acc_stderr,none": 0.007974674313851815}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20802620802620803, "acc_stderr,none": 0.011620759575652367}, "copa": {"alias": "copa", "acc,none": 0.78, "acc_stderr,none": 0.04163331998932261}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4403505277833101, "acc_stderr,none": 0.004954146286513353, "acc_norm,none": 0.5850428201553476, "acc_norm_stderr,none": 0.0049170767266237935}, "mmlu": {"acc,none": 0.28250961401509755, "acc_stderr,none": 0.00379075856559546, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27545164718384696, "acc_stderr,none": 0.006509153435560764, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.03718489006818116}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.34545454545454546, "acc_stderr,none": 0.03713158067481912}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.29901960784313725, "acc_stderr,none": 0.032133257173736156}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.030685820596610795}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.34710743801652894, "acc_stderr,none": 0.043457245702925335}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26380368098159507, "acc_stderr,none": 0.03462419931615623}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.023083658586984204}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2558659217877095, "acc_stderr,none": 0.014593620923210746}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2733118971061093, 
"acc_stderr,none": 0.02531176597542612}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.27469135802469136, "acc_stderr,none": 0.024836057868294677}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2796610169491525, "acc_stderr,none": 0.011463397393861966}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23976608187134502, "acc_stderr,none": 0.032744852119469564}, "mmlu_other": {"acc,none": 0.29578371419375604, "acc_stderr,none": 0.008183240358413063, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3584905660377358, "acc_stderr,none": 0.029514703583981762}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3236994219653179, "acc_stderr,none": 0.035676037996391706}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.25112107623318386, "acc_stderr,none": 0.029105220833224622}, "mmlu_management": {"alias": " - management", "acc,none": 0.30097087378640774, "acc_stderr,none": 0.045416094465039476}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.24786324786324787, "acc_stderr,none": 0.028286324075564397}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.280970625798212, "acc_stderr,none": 0.016073127851221235}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.026992544339297236}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.26595744680851063, "acc_stderr,none": 0.026358065698880585}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.02841820861940679}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.29518072289156627, "acc_stderr,none": 0.035509201856896294}, "mmlu_social_sciences": {"acc,none": 0.2983425414364641, "acc_stderr,none": 0.008238581720451138, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.038351539543994194}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3282828282828283, "acc_stderr,none": 0.033456784227567746}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.25906735751295334, "acc_stderr,none": 0.03161877917935409}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.32051282051282054, "acc_stderr,none": 0.023661296393964283}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2815126050420168, "acc_stderr,none": 0.029213549414372163}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3431192660550459, "acc_stderr,none": 0.02035477773608604}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2824427480916031, "acc_stderr,none": 0.03948406125768361}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2581699346405229, "acc_stderr,none": 0.01770453165325007}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3090909090909091, "acc_stderr,none": 
0.044262946482000985}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2979591836734694, "acc_stderr,none": 0.02927956741106568}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.03333333333333334}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_stem": {"acc,none": 0.26450999048525214, "acc_stderr,none": 0.007809882323298409, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.038201699145179055}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.35526315789473684, "acc_stderr,none": 0.03894734487013317}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3472222222222222, "acc_stderr,none": 0.039812405437178615}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.04280105837364395}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.23829787234042554, "acc_stderr,none": 0.0278512529738898}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.35172413793103446, "acc_stderr,none": 0.039792366374974096}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.21693121693121692, "acc_stderr,none": 0.021227082449445055}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.31290322580645163, "acc_stderr,none": 0.026377567028645854}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.15763546798029557, "acc_stderr,none": 0.025639014131172404}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.22592592592592592, "acc_stderr,none": 0.025497532639609542}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.271523178807947, "acc_stderr,none": 0.036313298039696545}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.22685185185185186, "acc_stderr,none": 0.028561650102422266}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.23214285714285715, "acc_stderr,none": 0.04007341809755807}, "mmlu_pro": {"exact_match,custom-extract": 0.14311835106382978, "exact_match_stderr,custom-extract": 0.0031642904485157273, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.27615062761506276, "exact_match_stderr,custom-extract": 0.016708620967658634}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.1267427122940431, "exact_match_stderr,custom-extract": 0.011851395705593078}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 
0.09717314487632508, "exact_match_stderr,custom-extract": 0.008807325782377471}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.15853658536585366, "exact_match_stderr,custom-extract": 0.01806012347189305}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.18246445497630331, "exact_match_stderr,custom-extract": 0.013302359232350739}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10526315789473684, "exact_match_stderr,custom-extract": 0.009863889056501646}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.19926650366748166, "exact_match_stderr,custom-extract": 0.013974945415562682}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.13648293963254593, "exact_match_stderr,custom-extract": 0.017610952544682618}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.12443233424159855, "exact_match_stderr,custom-extract": 0.009952109344107885}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.0917838638045892, "exact_match_stderr,custom-extract": 0.007857979485361455}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1461038961038961, "exact_match_stderr,custom-extract": 0.011626060111830464}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.16633266533066132, "exact_match_stderr,custom-extract": 0.01668670139852614}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.11393379522709776, "exact_match_stderr,custom-extract": 0.008819054413543085}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.18922305764411027, "exact_match_stderr,custom-extract": 0.01387421952770834}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.276, "acc_stderr,none": 0.02001121929807354, "acc_norm,none": 0.392, "acc_norm_stderr,none": 0.021854684955611263}, "piqa": {"alias": "piqa", "acc,none": 0.7040261153427638, "acc_stderr,none": 0.010650414317148131, "acc_norm,none": 0.7040261153427638, "acc_norm_stderr,none": 0.010650414317148126}, "race": {"alias": "race", "acc,none": 0.36650717703349284, "acc_stderr,none": 0.014912890943719231}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43551688843398156, "acc_stderr,none": 0.011219586604022605}, "winogrande": {"alias": "winogrande", "acc,none": 0.6322020520915549, "acc_stderr,none": 0.013552385559833596}} {"created_at": "2025-04-28T20:00:14.824806", "global_step": 376000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3250853242320819, "acc_stderr,none": 0.013688147309729125, "acc_norm,none": 0.3822525597269625, "acc_norm_stderr,none": 0.014200454049979279}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6418350168350169, "acc_stderr,none": 0.009838331651451848, "acc_norm,none": 0.6022727272727273, "acc_norm_stderr,none": 0.010042861602178063}, "boolq": {"alias": "boolq", "acc,none": 0.6951070336391437, "acc_stderr,none": 0.008051783411024624}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20475020475020475, "acc_stderr,none": 0.01155271447787667}, "copa": {"alias": "copa", "acc,none": 0.75, "acc_stderr,none": 0.04351941398892446}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4386576379207329, "acc_stderr,none": 0.004952087083128892, "acc_norm,none": 0.5838478390758813, "acc_norm_stderr,none": 0.004919120169394329}, "mmlu": {"acc,none": 0.2956843754450933, "acc_stderr,none": 0.0038438549896058724, "alias": "mmlu"}, 
"mmlu_humanities": {"acc,none": 0.2928799149840595, "acc_stderr,none": 0.006619705407020116, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04006168083848876}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.03546563019624336}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.36764705882352944, "acc_stderr,none": 0.03384132045674119}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3628691983122363, "acc_stderr,none": 0.03129920825530213}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4049586776859504, "acc_stderr,none": 0.04481137755942469}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.35185185185185186, "acc_stderr,none": 0.04616631111801713}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3067484662576687, "acc_stderr,none": 0.036230899157241474}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2832369942196532, "acc_stderr,none": 0.024257901705323374}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24134078212290502, "acc_stderr,none": 0.014310999547961459}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.31511254019292606, "acc_stderr,none": 0.026385273703464492}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.30246913580246915, "acc_stderr,none": 0.025557653981868052}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.288135593220339, "acc_stderr,none": 0.011567140661324565}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.03377310252209195}, "mmlu_other": {"acc,none": 0.28838107499195365, "acc_stderr,none": 0.008115950942020542, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.27169811320754716, "acc_stderr,none": 0.027377706624670713}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.30057803468208094, "acc_stderr,none": 0.0349610148119118}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.34977578475336324, "acc_stderr,none": 0.03200736719484504}, "mmlu_management": {"alias": " - management", "acc,none": 0.30097087378640774, "acc_stderr,none": 0.045416094465039476}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.02948036054954119}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.29118773946360155, "acc_stderr,none": 0.016246087069701393}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3137254901960784, "acc_stderr,none": 0.02656892101545715}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.26595744680851063, "acc_stderr,none": 0.026358065698880585}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.20220588235294118, "acc_stderr,none": 0.02439819298665492}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.29518072289156627, "acc_stderr,none": 
0.035509201856896294}, "mmlu_social_sciences": {"acc,none": 0.31556711082222944, "acc_stderr,none": 0.00837547228443515, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.0414243971948936}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3282828282828283, "acc_stderr,none": 0.03345678422756776}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.2849740932642487, "acc_stderr,none": 0.0325771407770966}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.35128205128205126, "acc_stderr,none": 0.024203665177902796}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2815126050420168, "acc_stderr,none": 0.029213549414372174}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.326605504587156, "acc_stderr,none": 0.0201069908899373}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2824427480916031, "acc_stderr,none": 0.03948406125768362}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.315359477124183, "acc_stderr,none": 0.018798086284886887}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.23636363636363636, "acc_stderr,none": 0.04069306319721377}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3020408163265306, "acc_stderr,none": 0.029393609319879815}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.3482587064676617, "acc_stderr,none": 0.033687874661154596}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.39, "acc_stderr,none": 0.04902071300001975}, "mmlu_stem": {"acc,none": 0.287662543609261, "acc_stderr,none": 0.008043884162928115, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.362962962962963, "acc_stderr,none": 0.04153948404742398}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.34868421052631576, "acc_stderr,none": 0.038781398887976125}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3194444444444444, "acc_stderr,none": 0.038990736873573344}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110196}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.040925639582376556}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.38, "acc_stderr,none": 0.04878317312145632}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.28085106382978725, "acc_stderr,none": 0.02937917046412482}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2896551724137931, "acc_stderr,none": 0.037800192304380135}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2275132275132275, "acc_stderr,none": 0.021591269407823774}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.31290322580645163, 
"acc_stderr,none": 0.02637756702864586}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.270935960591133, "acc_stderr,none": 0.03127090713297697}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.22962962962962963, "acc_stderr,none": 0.025644108639267634}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31788079470198677, "acc_stderr,none": 0.038020397601079024}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2916666666666667, "acc_stderr,none": 0.03099866630456053}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3482142857142857, "acc_stderr,none": 0.045218299028335865}, "mmlu_pro": {"exact_match,custom-extract": 0.13696808510638298, "exact_match_stderr,custom-extract": 0.003110708408763237, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.24407252440725244, "exact_match_stderr,custom-extract": 0.016052511909317394}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.12547528517110265, "exact_match_stderr,custom-extract": 0.011800544058606074}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.08303886925795052, "exact_match_stderr,custom-extract": 0.00820511881432122}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.13658536585365855, "exact_match_stderr,custom-extract": 0.01698048669306053}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.18364928909952608, "exact_match_stderr,custom-extract": 0.013335804681083531}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.11042311661506708, "exact_match_stderr,custom-extract": 0.010073585248785344}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1821515892420538, "exact_match_stderr,custom-extract": 0.01350336046749538}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.14960629921259844, "exact_match_stderr,custom-extract": 0.018297559115940505}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10263396911898275, "exact_match_stderr,custom-extract": 0.009150272599112007}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.10214655810510732, "exact_match_stderr,custom-extract": 0.008242286574704116}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1396103896103896, "exact_match_stderr,custom-extract": 0.011407897167896813}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12024048096192384, "exact_match_stderr,custom-extract": 0.014574466566662003}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.11778290993071594, "exact_match_stderr,custom-extract": 0.008947290267287305}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.20426065162907267, "exact_match_stderr,custom-extract": 0.014280670962640839}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.274, "acc_stderr,none": 0.019966103540279462, "acc_norm,none": 0.374, "acc_norm_stderr,none": 0.02166071034720448}, "piqa": {"alias": "piqa", "acc,none": 0.7018498367791077, "acc_stderr,none": 0.010672964114008301, "acc_norm,none": 0.7002176278563657, "acc_norm_stderr,none": 
0.01068968696713809}, "race": {"alias": "race", "acc,none": 0.3770334928229665, "acc_stderr,none": 0.01499933708984336}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43654042988741043, "acc_stderr,none": 0.011222574420844783}, "winogrande": {"alias": "winogrande", "acc,none": 0.6187845303867403, "acc_stderr,none": 0.013650172164160307}}
{"created_at": "2025-04-28T21:31:33.535331", "global_step": 378000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3626279863481229, "acc_stderr,none": 0.014049106564955007, "acc_norm,none": 0.4035836177474403, "acc_norm_stderr,none": 0.014337158914268445}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6734006734006734, "acc_stderr,none": 0.009623047038267635, "acc_norm,none": 0.656986531986532, "acc_norm_stderr,none": 0.00974096566648923}, "boolq": {"alias": "boolq", "acc,none": 0.6807339449541284, "acc_stderr,none": 0.00815375422152046}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21457821457821458, "acc_stderr,none": 0.011753423094216842}, "copa": {"alias": "copa", "acc,none": 0.75, "acc_stderr,none": 0.04351941398892446}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4373630750846445, "acc_stderr,none": 0.004950472918523324, "acc_norm,none": 0.5827524397530373, "acc_norm_stderr,none": 0.004920967192255302}, "mmlu": {"acc,none": 0.27902008260931493, "acc_stderr,none": 0.0037754914938657735, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27885228480340063, "acc_stderr,none": 0.00651368409505032, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.0361960452412425}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3515151515151515, "acc_stderr,none": 0.037282069986826503}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3627450980392157, "acc_stderr,none": 0.03374499356319355}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.4050632911392405, "acc_stderr,none": 0.031955147413706725}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.30578512396694213, "acc_stderr,none": 0.042059539338841226}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3425925925925926, "acc_stderr,none": 0.045879047413018105}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.27607361963190186, "acc_stderr,none": 0.0351238528370505}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24277456647398843, "acc_stderr,none": 0.023083658586984204}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2829581993569132, "acc_stderr,none": 0.025583062489984827}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2839506172839506, "acc_stderr,none": 0.025089478523765137}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2737940026075619, "acc_stderr,none": 0.0113886121679794}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.03188578017686398}, "mmlu_other": {"acc,none": 0.30286449951721917, "acc_stderr,none": 0.008225358794313043, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.38, "acc_stderr,none": 0.048783173121456316}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none":
0.3283018867924528, "acc_stderr,none": 0.028901593612411784}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2658959537572254, "acc_stderr,none": 0.033687629322594316}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621505}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.39461883408071746, "acc_stderr,none": 0.03280400504755291}, "mmlu_management": {"alias": " - management", "acc,none": 0.24271844660194175, "acc_stderr,none": 0.04245022486384495}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3247863247863248, "acc_stderr,none": 0.03067902276549883}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.3116219667943806, "acc_stderr,none": 0.016562433867284176}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.238562091503268, "acc_stderr,none": 0.024404394928087873}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.29432624113475175, "acc_stderr,none": 0.027187127011503796}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.24632352941176472, "acc_stderr,none": 0.02617343857052}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3072289156626506, "acc_stderr,none": 0.03591566797824664}, "mmlu_social_sciences": {"acc,none": 0.2694182645433864, "acc_stderr,none": 0.007998449971422454, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.20175438596491227, "acc_stderr,none": 0.037752050135836386}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2474747474747475, "acc_stderr,none": 0.03074630074212451}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.29015544041450775, "acc_stderr,none": 0.03275264467791515}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2692307692307692, "acc_stderr,none": 0.022489389793654817}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.02959732973097809}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.28073394495412846, "acc_stderr,none": 0.019266055045871623}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.25190839694656486, "acc_stderr,none": 0.03807387116306085}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.01795244919698787}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.35454545454545455, "acc_stderr,none": 0.04582004841505416}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22448979591836735, "acc_stderr,none": 0.026711430555538422}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.27860696517412936, "acc_stderr,none": 0.031700561834973086}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_stem": {"acc,none": 0.2651443070091976, "acc_stderr,none": 0.007849569578377399, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3037037037037037, "acc_stderr,none": 0.03972552884785136}, 
"mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.24342105263157895, "acc_stderr,none": 0.034923496688842384}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.03745554791462457}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617746}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.33191489361702126, "acc_stderr,none": 0.03078373675774564}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.22758620689655173, "acc_stderr,none": 0.03493950380131184}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2566137566137566, "acc_stderr,none": 0.022494510767503154}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3064516129032258, "acc_stderr,none": 0.026226485652553873}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.28078817733990147, "acc_stderr,none": 0.0316185633535861}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.026466117538959916}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2185430463576159, "acc_stderr,none": 0.03374235550425694}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.20833333333333334, "acc_stderr,none": 0.027696910713093933}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.36607142857142855, "acc_stderr,none": 0.0457237235873743}, "mmlu_pro": {"exact_match,custom-extract": 0.1395445478723404, "exact_match_stderr,custom-extract": 0.003136640907719521, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.2301255230125523, "exact_match_stderr,custom-extract": 0.015730257941409635}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.13561470215462612, "exact_match_stderr,custom-extract": 0.012196745567121883}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0706713780918728, "exact_match_stderr,custom-extract": 0.00762035377774721}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.13170731707317074, "exact_match_stderr,custom-extract": 0.01672154370034767}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.20260663507109006, "exact_match_stderr,custom-extract": 0.013843609918758566}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.09907120743034056, "exact_match_stderr,custom-extract": 0.009602432935115172}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1784841075794621, "exact_match_stderr,custom-extract": 0.013396666080859067}, 
"mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.14173228346456693, "exact_match_stderr,custom-extract": 0.017891797833268233}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.11444141689373297, "exact_match_stderr,custom-extract": 0.00959851214763325}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.11472982975573649, "exact_match_stderr,custom-extract": 0.008673796038557053}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.14935064935064934, "exact_match_stderr,custom-extract": 0.011732160468660276}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1402805611222445, "exact_match_stderr,custom-extract": 0.015561893867712506}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.12317167051578137, "exact_match_stderr,custom-extract": 0.009121690995025526}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.19674185463659147, "exact_match_stderr,custom-extract": 0.014081430918094351}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.256, "acc_stderr,none": 0.01953692357474761, "acc_norm,none": 0.372, "acc_norm_stderr,none": 0.0216371979857224}, "piqa": {"alias": "piqa", "acc,none": 0.7013057671381937, "acc_stderr,none": 0.010678556398149228, "acc_norm,none": 0.6980413492927094, "acc_norm_stderr,none": 0.010711732891588348}, "race": {"alias": "race", "acc,none": 0.3684210526315789, "acc_stderr,none": 0.014929174445557294}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43551688843398156, "acc_stderr,none": 0.0112195866040226}, "winogrande": {"alias": "winogrande", "acc,none": 0.6172059984214681, "acc_stderr,none": 0.013660946109442016}} {"created_at": "2025-04-28T23:32:29.791677", "global_step": 380000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3660409556313993, "acc_stderr,none": 0.014077223108470144, "acc_norm,none": 0.39334470989761094, "acc_norm_stderr,none": 0.014275101465693024}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6683501683501684, "acc_stderr,none": 0.009660733780923964, "acc_norm,none": 0.6405723905723906, "acc_norm_stderr,none": 0.009845958893373757}, "boolq": {"alias": "boolq", "acc,none": 0.6379204892966361, "acc_stderr,none": 0.00840577556824439}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21375921375921375, "acc_stderr,none": 0.011737086112127208}, "copa": {"alias": "copa", "acc,none": 0.72, "acc_stderr,none": 0.04512608598542127}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4373630750846445, "acc_stderr,none": 0.004950472918523324, "acc_norm,none": 0.5876319458275244, "acc_norm_stderr,none": 0.004912547040132877}, "mmlu": {"acc,none": 0.288135593220339, "acc_stderr,none": 0.0038140674282916634, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.27885228480340063, "acc_stderr,none": 0.006528153569345433, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.037184890068181146}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3393939393939394, "acc_stderr,none": 0.03697442205031596}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.0319800166011507}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.29957805907172996, "acc_stderr,none": 0.029818024749753095}, "mmlu_international_law": {"alias": " - 
international_law", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.04391326286724071}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.043300437496507437}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2883435582822086, "acc_stderr,none": 0.035590395316173425}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.28901734104046245, "acc_stderr,none": 0.02440517393578324}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2446927374301676, "acc_stderr,none": 0.014378169884098416}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2990353697749196, "acc_stderr,none": 0.026003301117885142}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.02563082497562134}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2835723598435463, "acc_stderr,none": 0.011511900775968307}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.17543859649122806, "acc_stderr,none": 0.029170885500727668}, "mmlu_other": {"acc,none": 0.2784036047634374, "acc_stderr,none": 0.008025038020246178, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816508}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2981132075471698, "acc_stderr,none": 0.02815283794249386}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.35260115606936415, "acc_stderr,none": 0.03643037168958548}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036845}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.242152466367713, "acc_stderr,none": 0.028751392398694755}, "mmlu_management": {"alias": " - management", "acc,none": 0.3592233009708738, "acc_stderr,none": 0.04750458399041694}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2948717948717949, "acc_stderr,none": 0.029872577708891162}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2656449553001277, "acc_stderr,none": 0.015794302487888705}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.02564686309713791}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.24113475177304963, "acc_stderr,none": 0.025518731049537773}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3382352941176471, "acc_stderr,none": 0.028739328513983576}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.29518072289156627, "acc_stderr,none": 0.035509201856896294}, "mmlu_social_sciences": {"acc,none": 0.30289242768930774, "acc_stderr,none": 0.008266213119105031, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.03835153954399421}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3434343434343434, "acc_stderr,none": 0.03383201223244441}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.35233160621761656, "acc_stderr,none": 0.034474782864143586}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.31025641025641026, "acc_stderr,none": 0.0234546748894043}, 
"mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.33613445378151263, "acc_stderr,none": 0.030684737115135353}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.30275229357798167, "acc_stderr,none": 0.01969871143475635}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3282442748091603, "acc_stderr,none": 0.04118438565806298}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2679738562091503, "acc_stderr,none": 0.017917974069594722}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.18181818181818182, "acc_stderr,none": 0.036942843353378024}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2979591836734694, "acc_stderr,none": 0.029279567411065674}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.34328358208955223, "acc_stderr,none": 0.03357379665433431}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.37, "acc_stderr,none": 0.048523658709391}, "mmlu_stem": {"acc,none": 0.29717729146844274, "acc_stderr,none": 0.008125444996458796, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2740740740740741, "acc_stderr,none": 0.03853254836552003}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3881578947368421, "acc_stderr,none": 0.03965842097512744}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3263888888888889, "acc_stderr,none": 0.03921067198982266}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.4, "acc_stderr,none": 0.049236596391733084}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.3137254901960784, "acc_stderr,none": 0.04617034827006716}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.25957446808510637, "acc_stderr,none": 0.02865917937429232}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.036951833116502325}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.022182037202948368}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3225806451612903, "acc_stderr,none": 0.02659308451657228}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3251231527093596, "acc_stderr,none": 0.03295797566311271}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.026962424325073828}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2980132450331126, "acc_stderr,none": 0.03734535676787198}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3472222222222222, "acc_stderr,none": 0.032468872436376486}, 
"mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.22321428571428573, "acc_stderr,none": 0.039523019677025116}, "mmlu_pro": {"exact_match,custom-extract": 0.13597074468085107, "exact_match_stderr,custom-extract": 0.0030977959261621144, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.2301255230125523, "exact_match_stderr,custom-extract": 0.015730257941409635}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.10773130544993663, "exact_match_stderr,custom-extract": 0.011044744671924548}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.060954063604240286, "exact_match_stderr,custom-extract": 0.007113993242445053}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.17073170731707318, "exact_match_stderr,custom-extract": 0.018605569494622855}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.17417061611374407, "exact_match_stderr,custom-extract": 0.013062274992927891}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10835913312693499, "exact_match_stderr,custom-extract": 0.00999056535282795}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.18581907090464547, "exact_match_stderr,custom-extract": 0.013608008648555643}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.12860892388451445, "exact_match_stderr,custom-extract": 0.017173163625244663}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1226158038147139, "exact_match_stderr,custom-extract": 0.009889441958223884}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.10066617320503331, "exact_match_stderr,custom-extract": 0.00818908464008261}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.14935064935064934, "exact_match_stderr,custom-extract": 0.011732160468660283}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.14228456913827656, "exact_match_stderr,custom-extract": 0.01565437891978721}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.10931485758275597, "exact_match_stderr,custom-extract": 0.00866092652669347}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.21553884711779447, "exact_match_stderr,custom-extract": 0.014565296774840433}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.254, "acc_stderr,none": 0.019486596801643382, "acc_norm,none": 0.388, "acc_norm_stderr,none": 0.021814300984787635}, "piqa": {"alias": "piqa", "acc,none": 0.6942328618063112, "acc_stderr,none": 0.010749627366141642, "acc_norm,none": 0.6996735582154516, "acc_norm_stderr,none": 0.010695225308183138}, "race": {"alias": "race", "acc,none": 0.3588516746411483, "acc_stderr,none": 0.014845215125262313}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43244626407369496, "acc_stderr,none": 0.011210331273967561}, "winogrande": {"alias": "winogrande", "acc,none": 0.6179952644041041, "acc_stderr,none": 0.013655578215970417}} {"created_at": "2025-04-29T01:35:31.593492", "global_step": 382000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3728668941979522, "acc_stderr,none": 0.01413117676013116, "acc_norm,none": 0.4189419795221843, "acc_norm_stderr,none": 0.014418106953639013}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6986531986531986, "acc_stderr,none": 0.009415259879351615, "acc_norm,none": 
0.696969696969697, "acc_norm_stderr,none": 0.009430140669278957}, "boolq": {"alias": "boolq", "acc,none": 0.673394495412844, "acc_stderr,none": 0.008202364612924434}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.24733824733824733, "acc_stderr,none": 0.012352806767465217}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816505}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4400517825134435, "acc_stderr,none": 0.00495378714651092, "acc_norm,none": 0.5889265086636128, "acc_norm_stderr,none": 0.004910229643262754}, "mmlu": {"acc,none": 0.30309072781655033, "acc_stderr,none": 0.0038657810745344394, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.29521785334750267, "acc_stderr,none": 0.006624400683774049, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.15873015873015872, "acc_stderr,none": 0.03268454013011745}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.34545454545454546, "acc_stderr,none": 0.03713158067481912}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.30392156862745096, "acc_stderr,none": 0.03228210387037894}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2911392405063291, "acc_stderr,none": 0.029571601065753374}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.45454545454545453, "acc_stderr,none": 0.04545454545454546}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3006134969325153, "acc_stderr,none": 0.03602511318806771}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3554913294797688, "acc_stderr,none": 0.025770292082977257}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.33762057877813506, "acc_stderr,none": 0.026858825879488544}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.31790123456790126, "acc_stderr,none": 0.025910063528240875}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2907431551499348, "acc_stderr,none": 0.011598062372851986}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30409356725146197, "acc_stderr,none": 0.03528211258245231}, "mmlu_other": {"acc,none": 0.32410685548760865, "acc_stderr,none": 0.008385688536148034, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3132075471698113, "acc_stderr,none": 0.028544793319055326}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.32947976878612717, "acc_stderr,none": 0.03583901754736411}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.38, "acc_stderr,none": 0.04878317312145632}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.33183856502242154, "acc_stderr,none": 0.031602951437766785}, "mmlu_management": {"alias": " - management", "acc,none": 0.2621359223300971, "acc_stderr,none": 0.04354631077260593}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.32051282051282054, "acc_stderr,none": 0.03057281131029961}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.35, 
"acc_stderr,none": 0.047937248544110196}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.36270753512132825, "acc_stderr,none": 0.017192708674602288}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.35947712418300654, "acc_stderr,none": 0.027475969910660952}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.026992199173064356}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.02679956202488768}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.27710843373493976, "acc_stderr,none": 0.034843315926805875}, "mmlu_social_sciences": {"acc,none": 0.3051673708157296, "acc_stderr,none": 0.008273281504431336, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3484848484848485, "acc_stderr,none": 0.033948539651564025}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.29533678756476683, "acc_stderr,none": 0.032922966391551386}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2641025641025641, "acc_stderr,none": 0.022352193737453285}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2773109243697479, "acc_stderr,none": 0.02907937453948001}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3155963302752294, "acc_stderr,none": 0.019926117513869666}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2824427480916031, "acc_stderr,none": 0.03948406125768362}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.018635594034423983}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.040139645540727735}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.02892058322067558}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.42786069651741293, "acc_stderr,none": 0.03498541988407795}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.41, "acc_stderr,none": 0.049431107042371025}, "mmlu_stem": {"acc,none": 0.2921027592768792, "acc_stderr,none": 0.008075775748436145, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.4444444444444444, "acc_stderr,none": 0.04292596718256981}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.34868421052631576, "acc_stderr,none": 0.0387813988879761}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.03852084696008534}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.35, "acc_stderr,none": 0.04793724854411018}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 
0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.23829787234042554, "acc_stderr,none": 0.02785125297388979}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.32413793103448274, "acc_stderr,none": 0.03900432069185553}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.02286083830923207}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3032258064516129, "acc_stderr,none": 0.02614868593067175}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.30049261083743845, "acc_stderr,none": 0.03225799476233485}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2847682119205298, "acc_stderr,none": 0.03684881521389023}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.028963702570791037}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.26785714285714285, "acc_stderr,none": 0.04203277291467762}, "mmlu_pro": {"exact_match,custom-extract": 0.14012632978723405, "exact_match_stderr,custom-extract": 0.0031453105687170167, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.22454672245467225, "exact_match_stderr,custom-extract": 0.015594615345646805}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.12420785804816223, "exact_match_stderr,custom-extract": 0.01174929882599815}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.07597173144876325, "exact_match_stderr,custom-extract": 0.007878387512999479}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.17317073170731706, "exact_match_stderr,custom-extract": 0.01871041898122491}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.1872037914691943, "exact_match_stderr,custom-extract": 0.013434897809670469}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.1217750257997936, "exact_match_stderr,custom-extract": 0.01051100706477546}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.1687041564792176, "exact_match_stderr,custom-extract": 0.013101760842249751}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.11811023622047244, "exact_match_stderr,custom-extract": 0.016556141198042423}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.10626702997275204, "exact_match_stderr,custom-extract": 0.009291949023141266}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.12213175425610659, "exact_match_stderr,custom-extract": 0.00891173129700119}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.15584415584415584, "exact_match_stderr,custom-extract": 0.011938663890309691}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1523046092184369, "exact_match_stderr,custom-extract": 
0.016101330436514016}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1147036181678214, "exact_match_stderr,custom-extract": 0.008844953561387795}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.20050125313283207, "exact_match_stderr,custom-extract": 0.014182026045901089}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.266, "acc_stderr,none": 0.01978055967565549, "acc_norm,none": 0.37, "acc_norm_stderr,none": 0.021613289165165785}, "piqa": {"alias": "piqa", "acc,none": 0.6936887921653971, "acc_stderr,none": 0.01075497003236732, "acc_norm,none": 0.6920565832426551, "acc_norm_stderr,none": 0.010770892367463685}, "race": {"alias": "race", "acc,none": 0.3684210526315789, "acc_stderr,none": 0.014929174445557294}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43858751279426816, "acc_stderr,none": 0.011228404348646076}, "winogrande": {"alias": "winogrande", "acc,none": 0.6282557221783741, "acc_stderr,none": 0.01358230628499286}}
{"created_at": "2025-04-29T23:56:07.758628", "global_step": 390000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.35238907849829354, "acc_stderr,none": 0.013960142600598673, "acc_norm,none": 0.3993174061433447, "acc_norm_stderr,none": 0.0143120945579467}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6481481481481481, "acc_stderr,none": 0.009799078929868706, "acc_norm,none": 0.6216329966329966, "acc_norm_stderr,none": 0.009951575683331949}, "boolq": {"alias": "boolq", "acc,none": 0.7415902140672783, "acc_stderr,none": 0.007656477542140288}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20802620802620803, "acc_stderr,none": 0.01162075957565237}, "copa": {"alias": "copa", "acc,none": 0.73, "acc_stderr,none": 0.044619604333847394}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.44433379804819756, "acc_stderr,none": 0.004958761056959772, "acc_norm,none": 0.5973909579764987, "acc_norm_stderr,none": 0.0048942100113032174}, "mmlu": {"acc,none": 0.28578550064093433, "acc_stderr,none": 0.003803034055750395, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2875664187035069, "acc_stderr,none": 0.006569018657383271, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1746031746031746, "acc_stderr,none": 0.03395490020856111}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3575757575757576, "acc_stderr,none": 0.037425970438065864}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3382352941176471, "acc_stderr,none": 0.03320574612945432}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.350210970464135, "acc_stderr,none": 0.03105239193758435}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.49586776859504134, "acc_stderr,none": 0.045641987674327526}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26380368098159507, "acc_stderr,none": 0.03462419931615624}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2861271676300578, "acc_stderr,none": 0.02433214677913413}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24804469273743016, "acc_stderr,none": 0.01444415780826146}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.31189710610932475, "acc_stderr,none": 0.026311858071854155}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3055555555555556, "acc_stderr,none": 0.025630824975621344}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2796610169491525, "acc_stderr,none": 0.011463397393861955}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2573099415204678, "acc_stderr,none": 0.03352799844161865}, "mmlu_other": {"acc,none": 0.28355326681686516, "acc_stderr,none": 0.00808176127996606, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2490566037735849, "acc_stderr,none": 0.026616482980501715}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.26011560693641617, "acc_stderr,none": 0.03345036916788991}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.30493273542600896, "acc_stderr,none": 0.030898610882477515}, "mmlu_management": {"alias": " - management", "acc,none": 0.2621359223300971, "acc_stderr,none": 0.04354631077260597}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2863247863247863, "acc_stderr,none": 0.02961432369045665}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.3243933588761175, "acc_stderr,none": 0.0167409290471627}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.30718954248366015, "acc_stderr,none": 0.026415601914388992}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.026684564340460994}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.23161764705882354, "acc_stderr,none": 0.025626533803777565}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2469879518072289, "acc_stderr,none": 0.03357351982064536}, "mmlu_social_sciences": {"acc,none": 0.29151771205719856, "acc_stderr,none": 0.008182901571420757, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.20175438596491227, "acc_stderr,none": 0.03775205013583638}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.03173071239071724}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.32642487046632124, "acc_stderr,none": 0.033840286211432945}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2743589743589744, "acc_stderr,none": 0.022622765767493214}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.02959732973097808}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.27522935779816515, "acc_stderr,none": 0.0191490937431552}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.26717557251908397, "acc_stderr,none": 0.038808483010823944}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3022875816993464, "acc_stderr,none": 0.018579232711113874}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04265792110940588}, "mmlu_security_studies": {"alias": " - security_studies",
"acc,none": 0.2816326530612245, "acc_stderr,none": 0.02879518557429129}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.39800995024875624, "acc_stderr,none": 0.034611994290400135}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_stem": {"acc,none": 0.2797335870599429, "acc_stderr,none": 0.00797346046944964, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.32592592592592595, "acc_stderr,none": 0.040491220417025055}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.32894736842105265, "acc_stderr,none": 0.03823428969926606}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.03942082639927213}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.39, "acc_stderr,none": 0.04902071300001975}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.038739587141493524}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2, "acc_stderr,none": 0.0261488180184245}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.03806142687309993}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2724867724867725, "acc_stderr,none": 0.02293097307163335}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3, "acc_stderr,none": 0.02606936229533514}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3103448275862069, "acc_stderr,none": 0.03255086769970103}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02671924078371216}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2781456953642384, "acc_stderr,none": 0.03658603262763743}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.22685185185185186, "acc_stderr,none": 0.028561650102422273}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.25, "acc_stderr,none": 0.04109974682633932}, "mmlu_pro": {"exact_match,custom-extract": 0.15450465425531915, "exact_match_stderr,custom-extract": 0.003272176863041575, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.2496513249651325, "exact_match_stderr,custom-extract": 0.01617491423255685}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.12040557667934093, "exact_match_stderr,custom-extract": 0.011593149221103034}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.09717314487632508, "exact_match_stderr,custom-extract": 0.008807325782377466}, "mmlu_pro_computer_science": {"alias": " - computer_science", 
"exact_match,custom-extract": 0.13658536585365855, "exact_match_stderr,custom-extract": 0.01698048669306053}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.20734597156398105, "exact_match_stderr,custom-extract": 0.013962907124731166}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.13725490196078433, "exact_match_stderr,custom-extract": 0.01106031519250513}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.19193154034229828, "exact_match_stderr,custom-extract": 0.013778001383013435}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.14698162729658792, "exact_match_stderr,custom-extract": 0.018164310621441037}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.12352406902815623, "exact_match_stderr,custom-extract": 0.009920862929791524}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.12065136935603257, "exact_match_stderr,custom-extract": 0.008865021419144305}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.15476190476190477, "exact_match_stderr,custom-extract": 0.011904761904761824}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.16432865731462926, "exact_match_stderr,custom-extract": 0.016605797464661214}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.14164742109314857, "exact_match_stderr,custom-extract": 0.00967831711512379}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.23809523809523808, "exact_match_stderr,custom-extract": 0.015086779329233782}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.246, "acc_stderr,none": 0.019279819056352555, "acc_norm,none": 0.37, "acc_norm_stderr,none": 0.021613289165165785}, "piqa": {"alias": "piqa", "acc,none": 0.7029379760609358, "acc_stderr,none": 0.010661725404814786, "acc_norm,none": 0.7094668117519043, "acc_norm_stderr,none": 0.010592765034696534}, "race": {"alias": "race", "acc,none": 0.3712918660287081, "acc_stderr,none": 0.014953126515089408}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43193449334698053, "acc_stderr,none": 0.011208746100567539}, "winogrande": {"alias": "winogrande", "acc,none": 0.6266771902131019, "acc_stderr,none": 0.013594002763035521}} {"created_at": "2025-05-01T17:03:28.871987", "global_step": 450000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.40784982935153585, "acc_stderr,none": 0.01436109728844969, "acc_norm,none": 0.4325938566552901, "acc_norm_stderr,none": 0.014478005694182531}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7112794612794613, "acc_stderr,none": 0.00929880556543552, "acc_norm,none": 0.7045454545454546, "acc_norm_stderr,none": 0.009361987126556452}, "boolq": {"alias": "boolq", "acc,none": 0.7623853211009174, "acc_stderr,none": 0.007444172627575039}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2932022932022932, "acc_stderr,none": 0.013033208167361504}, "copa": {"alias": "copa", "acc,none": 0.78, "acc_stderr,none": 0.04163331998932262}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.46654052977494526, "acc_stderr,none": 0.0049785963940454335, "acc_norm,none": 0.6320454092810197, "acc_norm_stderr,none": 0.004812633280078254}, "mmlu": {"acc,none": 0.3511608033043726, "acc_stderr,none": 0.004000042036806642, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.33156216790648246, "acc_stderr,none": 0.0068154198202006145, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " 
- formal_logic", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.03809523809523811}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.4303030303030303, "acc_stderr,none": 0.03866225962879077}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.4264705882352941, "acc_stderr,none": 0.034711579079534254}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.45147679324894513, "acc_stderr,none": 0.032393600173974704}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4297520661157025, "acc_stderr,none": 0.04519082021319773}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3888888888888889, "acc_stderr,none": 0.0471282125742677}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.294478527607362, "acc_stderr,none": 0.03581165790474082}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3063583815028902, "acc_stderr,none": 0.024818350129436593}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3665594855305466, "acc_stderr,none": 0.027368078243971618}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3734567901234568, "acc_stderr,none": 0.02691500301138015}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3272490221642764, "acc_stderr,none": 0.01198381980646474}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.34502923976608185, "acc_stderr,none": 0.036459813773888065}, "mmlu_other": {"acc,none": 0.3685226906984229, "acc_stderr,none": 0.00862758497850723, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.35471698113207545, "acc_stderr,none": 0.029445175328199583}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3699421965317919, "acc_stderr,none": 0.0368122963339432}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.36771300448430494, "acc_stderr,none": 0.03236198350928276}, "mmlu_management": {"alias": " - management", "acc,none": 0.3786407766990291, "acc_stderr,none": 0.04802694698258974}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.4230769230769231, "acc_stderr,none": 0.032366121762202014}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.4278416347381865, "acc_stderr,none": 0.017692787927803728}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3758169934640523, "acc_stderr,none": 0.027732834353363944}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.3049645390070922, "acc_stderr,none": 0.027464708442022135}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.0276784686421447}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.29518072289156627, "acc_stderr,none": 0.035509201856896294}, "mmlu_social_sciences": {"acc,none": 0.38414039649008774, "acc_stderr,none": 0.008713658939485844, "alias": " - social sciences"}, 
"mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.04303684033537315}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3686868686868687, "acc_stderr,none": 0.034373055019806184}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.47668393782383417, "acc_stderr,none": 0.03604513672442205}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.33076923076923076, "acc_stderr,none": 0.023854795680971118}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31932773109243695, "acc_stderr,none": 0.0302839955258844}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.45137614678899085, "acc_stderr,none": 0.02133571471126879}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.33587786259541985, "acc_stderr,none": 0.04142313771996662}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.35784313725490197, "acc_stderr,none": 0.01939305840235543}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2818181818181818, "acc_stderr,none": 0.043091187099464585}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.39591836734693875, "acc_stderr,none": 0.03130802899065685}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.5024875621890548, "acc_stderr,none": 0.03535490150137289}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.4, "acc_stderr,none": 0.04923659639173309}, "mmlu_stem": {"acc,none": 0.33111322549952427, "acc_stderr,none": 0.008326286559263792, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.4444444444444444, "acc_stderr,none": 0.04292596718256981}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.32894736842105265, "acc_stderr,none": 0.03823428969926605}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.4097222222222222, "acc_stderr,none": 0.04112490974670788}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110175}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956911}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3021276595744681, "acc_stderr,none": 0.030017554471880554}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3310344827586207, "acc_stderr,none": 0.039215453124671215}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.022644212615525218}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.41935483870967744, "acc_stderr,none": 0.028071588901091835}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.33497536945812806, "acc_stderr,none": 
0.033208527423483104}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110196}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.026202766534652148}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.32450331125827814, "acc_stderr,none": 0.038227469376587525}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.33796296296296297, "acc_stderr,none": 0.03225941352631295}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3392857142857143, "acc_stderr,none": 0.04493949068613539}, "mmlu_pro": {"exact_match,custom-extract": 0.17553191489361702, "exact_match_stderr,custom-extract": 0.003435849675696069, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3291492329149233, "exact_match_stderr,custom-extract": 0.017561146780265928}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.1520912547528517, "exact_match_stderr,custom-extract": 0.01279273989955857}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.11925795053003534, "exact_match_stderr,custom-extract": 0.009636886279174636}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.17804878048780487, "exact_match_stderr,custom-extract": 0.018916068139926738}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.23696682464454977, "exact_match_stderr,custom-extract": 0.014645415505512257}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.16305469556243551, "exact_match_stderr,custom-extract": 0.011873466052186893}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2176039119804401, "exact_match_stderr,custom-extract": 0.01443562714447019}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.14960629921259844, "exact_match_stderr,custom-extract": 0.018297559115940484}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.13896457765667575, "exact_match_stderr,custom-extract": 0.010429565658758024}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.13619541080680977, "exact_match_stderr,custom-extract": 0.009335167315129594}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.17207792207792208, "exact_match_stderr,custom-extract": 0.012423857401451379}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1523046092184369, "exact_match_stderr,custom-extract": 0.01610133043651402}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.147036181678214, "exact_match_stderr,custom-extract": 0.009829695542065219}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.24060150375939848, "exact_match_stderr,custom-extract": 0.015141010979644542}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.284, "acc_stderr,none": 0.02018670369357085, "acc_norm,none": 0.388, "acc_norm_stderr,none": 0.021814300984787635}, "piqa": {"alias": "piqa", "acc,none": 0.719260065288357, "acc_stderr,none": 0.010484325438311827, "acc_norm,none": 0.7219804134929271, "acc_norm_stderr,none": 0.010453117358332813}, "race": {"alias": "race", "acc,none": 0.37894736842105264, "acc_stderr,none": 0.015014241655133452}, "social_iqa": {"alias": "social_iqa", 
"acc,none": 0.44012282497441146, "acc_stderr,none": 0.011232649450458456}, "winogrande": {"alias": "winogrande", "acc,none": 0.6353591160220995, "acc_stderr,none": 0.01352774662242985}} {"created_at": "2025-05-02T00:17:53.508872", "global_step": 400000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.378839590443686, "acc_stderr,none": 0.01417591549000032, "acc_norm,none": 0.4274744027303754, "acc_norm_stderr,none": 0.014456862944650652}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.6927609427609428, "acc_stderr,none": 0.00946668883247538, "acc_norm,none": 0.678030303030303, "acc_norm_stderr,none": 0.009587386696300387}, "boolq": {"alias": "boolq", "acc,none": 0.7302752293577982, "acc_stderr,none": 0.0077624039763635015}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.26044226044226043, "acc_stderr,none": 0.012564981568550689}, "copa": {"alias": "copa", "acc,none": 0.75, "acc_stderr,none": 0.04351941398892446}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.44961163114917346, "acc_stderr,none": 0.004964378762425233, "acc_norm,none": 0.6021708822943637, "acc_norm_stderr,none": 0.004884495069459696}, "mmlu": {"acc,none": 0.31363053696054694, "acc_stderr,none": 0.0038936205454999694, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.30414452709883105, "acc_stderr,none": 0.006685580248950118, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.040061680838488774}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3696969696969697, "acc_stderr,none": 0.03769430314512568}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.39215686274509803, "acc_stderr,none": 0.034267123492472705}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.43037974683544306, "acc_stderr,none": 0.032230171959375976}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.35537190082644626, "acc_stderr,none": 0.0436923632657398}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3611111111111111, "acc_stderr,none": 0.04643454608906274}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2822085889570552, "acc_stderr,none": 0.03536117886664743}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2947976878612717, "acc_stderr,none": 0.02454761779480383}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2659217877094972, "acc_stderr,none": 0.014776765066438886}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3247588424437299, "acc_stderr,none": 0.026596782287697043}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.025407197798890162}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2835723598435463, "acc_stderr,none": 0.011511900775968312}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30994152046783624, "acc_stderr,none": 0.035469769593931624}, "mmlu_other": {"acc,none": 0.3543611200514966, "acc_stderr,none": 0.008544470169028907, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3622641509433962, "acc_stderr,none": 0.0295822451283843}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3179190751445087, 
"acc_stderr,none": 0.0355068398916558}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110196}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4080717488789238, "acc_stderr,none": 0.03298574607842822}, "mmlu_management": {"alias": " - management", "acc,none": 0.3106796116504854, "acc_stderr,none": 0.045821241601615506}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.4188034188034188, "acc_stderr,none": 0.03232128912157792}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.43, "acc_stderr,none": 0.04975698519562428}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.39719029374201786, "acc_stderr,none": 0.017497905037159367}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3431372549019608, "acc_stderr,none": 0.027184498909941613}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.28368794326241137, "acc_stderr,none": 0.02689170942834396}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2426470588235294, "acc_stderr,none": 0.026040662474201254}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3313253012048193, "acc_stderr,none": 0.03664314777288085}, "mmlu_social_sciences": {"acc,none": 0.31199220019499513, "acc_stderr,none": 0.008320375067846709, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.30303030303030304, "acc_stderr,none": 0.03274287914026866}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.29015544041450775, "acc_stderr,none": 0.03275264467791515}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2794871794871795, "acc_stderr,none": 0.022752388839776823}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.23109243697478993, "acc_stderr,none": 0.027381406927868966}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3284403669724771, "acc_stderr,none": 0.020135902797298415}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3053435114503817, "acc_stderr,none": 0.04039314978724561}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3366013071895425, "acc_stderr,none": 0.019117213911495158}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3181818181818182, "acc_stderr,none": 0.04461272175910508}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.028920583220675592}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.4527363184079602, "acc_stderr,none": 0.03519702717576915}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_stem": {"acc,none": 0.28924833491912466, "acc_stderr,none": 0.008020656626619133, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.4, "acc_stderr,none": 0.04232073695151589}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.34868421052631576, "acc_stderr,none": 0.03878139888797611}, "mmlu_college_biology": {"alias": " - 
college_biology", "acc,none": 0.3402777777777778, "acc_stderr,none": 0.039621355734862175}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.042801058373643966}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.41, "acc_stderr,none": 0.049431107042371025}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.33191489361702126, "acc_stderr,none": 0.030783736757745643}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.25517241379310346, "acc_stderr,none": 0.03632984052707842}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23544973544973544, "acc_stderr,none": 0.021851509822031698}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.36129032258064514, "acc_stderr,none": 0.027327548447957536}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2512315270935961, "acc_stderr,none": 0.030516530732694436}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110196}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.025348097468097866}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.304635761589404, "acc_stderr,none": 0.03757949922943343}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18981481481481483, "acc_stderr,none": 0.026744714834691936}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3482142857142857, "acc_stderr,none": 0.04521829902833587}, "mmlu_pro": {"exact_match,custom-extract": 0.15658244680851063, "exact_match_stderr,custom-extract": 0.003289633405261824, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.27615062761506276, "exact_match_stderr,custom-extract": 0.01670862096765863}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.13307984790874525, "exact_match_stderr,custom-extract": 0.012099922493633758}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.10424028268551237, "exact_match_stderr,custom-extract": 0.00908619915941718}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.17804878048780487, "exact_match_stderr,custom-extract": 0.01891606813992675}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.18246445497630331, "exact_match_stderr,custom-extract": 0.013302359232350737}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.12280701754385964, "exact_match_stderr,custom-extract": 0.01054924765177266}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2078239608801956, "exact_match_stderr,custom-extract": 0.014195399903279475}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.13910761154855644, "exact_match_stderr,custom-extract": 0.017752441192974866}, 
"mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.12897366030881016, "exact_match_stderr,custom-extract": 0.010105779411053322}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.13619541080680977, "exact_match_stderr,custom-extract": 0.009335167315129601}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.15367965367965367, "exact_match_stderr,custom-extract": 0.011870656198155955}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.12024048096192384, "exact_match_stderr,custom-extract": 0.014574466566661991}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1424172440338722, "exact_match_stderr,custom-extract": 0.00970022844693384}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.22681704260651628, "exact_match_stderr,custom-extract": 0.014833711131555816}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.244, "acc_stderr,none": 0.019226734893614594, "acc_norm,none": 0.364, "acc_norm_stderr,none": 0.021539170637317695}, "piqa": {"alias": "piqa", "acc,none": 0.705114254624592, "acc_stderr,none": 0.010639030620156992, "acc_norm,none": 0.704570184983678, "acc_norm_stderr,none": 0.010644731559342462}, "race": {"alias": "race", "acc,none": 0.3722488038277512, "acc_stderr,none": 0.014960984760899331}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4273285568065507, "acc_stderr,none": 0.011193930340551272}, "winogrande": {"alias": "winogrande", "acc,none": 0.6306235201262825, "acc_stderr,none": 0.013564470596053525}} {"created_at": "2025-05-02T00:24:20.005849", "global_step": 430000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3873720136518771, "acc_stderr,none": 0.014235872487909874, "acc_norm,none": 0.42235494880546076, "acc_norm_stderr,none": 0.01443413871337999}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7007575757575758, "acc_stderr,none": 0.009396447162309824, "acc_norm,none": 0.6805555555555556, "acc_norm_stderr,none": 0.009567482017268092}, "boolq": {"alias": "boolq", "acc,none": 0.6608562691131499, "acc_stderr,none": 0.008280145027624473}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.26535626535626533, "acc_stderr,none": 0.012640758880532866}, "copa": {"alias": "copa", "acc,none": 0.75, "acc_stderr,none": 0.04351941398892446}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.46106353316072496, "acc_stderr,none": 0.004974628903829148, "acc_norm,none": 0.6176060545708026, "acc_norm_stderr,none": 0.004849788423944371}, "mmlu": {"acc,none": 0.33228884774248685, "acc_stderr,none": 0.003957978140244812, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.31243358129649307, "acc_stderr,none": 0.006729308761201343, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.038932596106046706}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3939393939393939, "acc_stderr,none": 0.0381549430868893}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3872549019607843, "acc_stderr,none": 0.03418931233833343}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.41350210970464135, "acc_stderr,none": 0.03205649904851859}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.3305785123966942, "acc_stderr,none": 0.04294340845212094}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 
0.37037037037037035, "acc_stderr,none": 0.04668408033024932}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.294478527607362, "acc_stderr,none": 0.03581165790474082}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3063583815028902, "acc_stderr,none": 0.024818350129436596}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3408360128617363, "acc_stderr,none": 0.026920841260776162}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.35802469135802467, "acc_stderr,none": 0.026675611926037082}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.30964797913950454, "acc_stderr,none": 0.011808598262503318}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.034462962170884265}, "mmlu_other": {"acc,none": 0.3685226906984229, "acc_stderr,none": 0.0086424493818874, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.39622641509433965, "acc_stderr,none": 0.030102793781791197}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.32947976878612717, "acc_stderr,none": 0.03583901754736411}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.42152466367713004, "acc_stderr,none": 0.03314190222110658}, "mmlu_management": {"alias": " - management", "acc,none": 0.3592233009708738, "acc_stderr,none": 0.047504583990416946}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.4188034188034188, "acc_stderr,none": 0.03232128912157792}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.42, "acc_stderr,none": 0.049604496374885836}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.35759897828863346, "acc_stderr,none": 0.017139488998803288}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3464052287581699, "acc_stderr,none": 0.027245613047215355}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.02699219917306436}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.4227941176470588, "acc_stderr,none": 0.030008562845003476}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3614457831325301, "acc_stderr,none": 0.037400593820293204}, "mmlu_social_sciences": {"acc,none": 0.3555411114722132, "acc_stderr,none": 0.008614120804057579, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04434600701584925}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3282828282828283, "acc_stderr,none": 0.03345678422756776}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.38860103626943004, "acc_stderr,none": 0.035177397963731316}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.32564102564102565, "acc_stderr,none": 0.02375966576741229}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3277310924369748, "acc_stderr,none": 0.03048991141767323}, 
"mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.4036697247706422, "acc_stderr,none": 0.021035704856574956}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.32061068702290074, "acc_stderr,none": 0.04093329229834278}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3349673202614379, "acc_stderr,none": 0.019094228167000307}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.42727272727272725, "acc_stderr,none": 0.04738198703545483}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2979591836734694, "acc_stderr,none": 0.029279567411065677}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.43283582089552236, "acc_stderr,none": 0.03503490923673282}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.37, "acc_stderr,none": 0.048523658709391}, "mmlu_stem": {"acc,none": 0.30352045670789723, "acc_stderr,none": 0.008165579039894389, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110196}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.35555555555555557, "acc_stderr,none": 0.04135176749720386}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3223684210526316, "acc_stderr,none": 0.03803510248351585}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.039420826399272135}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.04280105837364395}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.42, "acc_stderr,none": 0.049604496374885836}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3617021276595745, "acc_stderr,none": 0.031410821975962386}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.037245636197746304}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.022860838309232072}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.36774193548387096, "acc_stderr,none": 0.02743086657997347}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2561576354679803, "acc_stderr,none": 0.030712730070982592}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.02534809746809783}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2913907284768212, "acc_stderr,none": 0.037101857261199946}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.030851992993257013}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": 
{"exact_match,custom-extract": 0.15699800531914893, "exact_match_stderr,custom-extract": 0.003291478983884699, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.2649930264993027, "exact_match_stderr,custom-extract": 0.016493257569680956}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.12547528517110265, "exact_match_stderr,custom-extract": 0.01180054405860609}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.0892226148409894, "exact_match_stderr,custom-extract": 0.008476416539386444}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.17317073170731706, "exact_match_stderr,custom-extract": 0.01871041898122491}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.2132701421800948, "exact_match_stderr,custom-extract": 0.014107954758464308}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.12796697626418987, "exact_match_stderr,custom-extract": 0.010736871151213488}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.19315403422982885, "exact_match_stderr,custom-extract": 0.013811351534162424}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.13385826771653545, "exact_match_stderr,custom-extract": 0.017467280079326578}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.12443233424159855, "exact_match_stderr,custom-extract": 0.009952109344107882}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.1376757957068838, "exact_match_stderr,custom-extract": 0.009377718706371134}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.14826839826839827, "exact_match_stderr,custom-extract": 0.011697009106623683}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1523046092184369, "exact_match_stderr,custom-extract": 0.016101330436514027}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.14780600461893764, "exact_match_stderr,custom-extract": 0.009850945752408193}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.23433583959899748, "exact_match_stderr,custom-extract": 0.015004079490362473}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.278, "acc_stderr,none": 0.020055833888070907, "acc_norm,none": 0.384, "acc_norm_stderr,none": 0.021772369465547198}, "piqa": {"alias": "piqa", "acc,none": 0.7149075081610446, "acc_stderr,none": 0.01053327058873894, "acc_norm,none": 0.7176278563656148, "acc_norm_stderr,none": 0.010502821668555361}, "race": {"alias": "race", "acc,none": 0.3712918660287081, "acc_stderr,none": 0.014953126515089408}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43807574206755373, "acc_stderr,none": 0.01122696506802993}, "winogrande": {"alias": "winogrande", "acc,none": 0.6345698500394633, "acc_stderr,none": 0.01353396509763878}} {"created_at": "2025-05-02T00:28:38.541371", "global_step": 420000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.40017064846416384, "acc_stderr,none": 0.01431719778780919, "acc_norm,none": 0.43686006825938567, "acc_norm_stderr,none": 0.014494421584256519}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7058080808080808, "acc_stderr,none": 0.009350328648861737, "acc_norm,none": 0.6835016835016835, "acc_norm_stderr,none": 0.009543851857323888}, "boolq": {"alias": "boolq", "acc,none": 0.7351681957186544, "acc_stderr,none": 
0.007717399182659714}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.22522522522522523, "acc_stderr,none": 0.011959591224286246}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816505}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4616610237004581, "acc_stderr,none": 0.0049750910556971995, "acc_norm,none": 0.6167098187612029, "acc_norm_stderr,none": 0.004851944170671268}, "mmlu": {"acc,none": 0.29896026207093007, "acc_stderr,none": 0.0038379323128981987, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2688629117959617, "acc_stderr,none": 0.006448025031341707, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.29365079365079366, "acc_stderr,none": 0.040735243221471255}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.37575757575757573, "acc_stderr,none": 0.037818873532059816}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.37254901960784315, "acc_stderr,none": 0.03393388584958404}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3080168776371308, "acc_stderr,none": 0.030052389335605705}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.18181818181818182, "acc_stderr,none": 0.03520893951097654}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.04414343666854934}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.24539877300613497, "acc_stderr,none": 0.03380939813943354}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.25722543352601157, "acc_stderr,none": 0.02353292543104428}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2446927374301676, "acc_stderr,none": 0.014378169884098414}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.27009646302250806, "acc_stderr,none": 0.025218040373410616}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.27469135802469136, "acc_stderr,none": 0.024836057868294677}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26597131681877445, "acc_stderr,none": 0.011285033165551286}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.19883040935672514, "acc_stderr,none": 0.03061111655743253}, "mmlu_other": {"acc,none": 0.30350820727389766, "acc_stderr,none": 0.008166316537045061, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.35094339622641507, "acc_stderr,none": 0.02937364625323469}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3468208092485549, "acc_stderr,none": 0.03629146670159663}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.18834080717488788, "acc_stderr,none": 0.026241132996407252}, "mmlu_management": {"alias": " - management", "acc,none": 0.42718446601941745, "acc_stderr,none": 0.04897957737781168}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2606837606837607, "acc_stderr,none": 0.028760348956523414}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2835249042145594, 
"acc_stderr,none": 0.016117318166832272}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3300653594771242, "acc_stderr,none": 0.026925654653615686}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.26595744680851063, "acc_stderr,none": 0.02635806569888059}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.46691176470588236, "acc_stderr,none": 0.030306257722468314}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.24096385542168675, "acc_stderr,none": 0.033293941190735296}, "mmlu_social_sciences": {"acc,none": 0.33019174520636985, "acc_stderr,none": 0.008432779743880996, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.0414243971948936}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3787878787878788, "acc_stderr,none": 0.03456088731993747}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.35751295336787564, "acc_stderr,none": 0.034588160421810066}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.36666666666666664, "acc_stderr,none": 0.024433016466052455}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3487394957983193, "acc_stderr,none": 0.030956636328566545}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3743119266055046, "acc_stderr,none": 0.020748959408988323}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.29770992366412213, "acc_stderr,none": 0.04010358942462203}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24183006535947713, "acc_stderr,none": 0.017322789207784326}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.041220665028782855}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.39591836734693875, "acc_stderr,none": 0.03130802899065686}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.3582089552238806, "acc_stderr,none": 0.03390393042268815}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_stem": {"acc,none": 0.30891214716143356, "acc_stderr,none": 0.008191144247836868, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.34210526315789475, "acc_stderr,none": 0.03860731599316092}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2986111111111111, "acc_stderr,none": 0.03827052357950756}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.41, "acc_stderr,none": 0.049431107042371025}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.30392156862745096, "acc_stderr,none": 0.045766654032077615}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 
0.0446196043338474}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.31063829787234043, "acc_stderr,none": 0.03025123757921317}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.036951833116502325}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2751322751322751, "acc_stderr,none": 0.023000086859068642}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.34516129032258064, "acc_stderr,none": 0.027045746573534327}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.03178529710642749}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.025928876132766118}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.33774834437086093, "acc_stderr,none": 0.038615575462551684}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.4583333333333333, "acc_stderr,none": 0.03398110890294636}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "mmlu_pro": {"exact_match,custom-extract": 0.1690492021276596, "exact_match_stderr,custom-extract": 0.003387277971356538, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.2789400278940028, "exact_match_stderr,custom-extract": 0.016760408620091188}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.1432192648922687, "exact_match_stderr,custom-extract": 0.012478789469306473}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.10159010600706714, "exact_match_stderr,custom-extract": 0.00898321237967249}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1975609756097561, "exact_match_stderr,custom-extract": 0.019687698399722342}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.2156398104265403, "exact_match_stderr,custom-extract": 0.014164735026459376}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.13828689370485037, "exact_match_stderr,custom-extract": 0.011095175629353074}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.24083129584352078, "exact_match_stderr,custom-extract": 0.014959413994450006}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.1679790026246719, "exact_match_stderr,custom-extract": 0.019177979237567234}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.13805631244323344, "exact_match_stderr,custom-extract": 0.010400907610650267}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.11991117690599556, "exact_match_stderr,custom-extract": 0.00884150509053395}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1590909090909091, "exact_match_stderr,custom-extract": 0.012039164679107935}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1342685370741483, "exact_match_stderr,custom-extract": 0.015277913884522447}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1739799846035412, 
"exact_match_stderr,custom-extract": 0.010522224976679298}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.24310776942355888, "exact_match_stderr,custom-extract": 0.01519453039465197}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.288, "acc_stderr,none": 0.02027150383507522, "acc_norm,none": 0.386, "acc_norm_stderr,none": 0.021793529219281165}, "piqa": {"alias": "piqa", "acc,none": 0.720892274211099, "acc_stderr,none": 0.010465657948498228, "acc_norm,none": 0.7159956474428727, "acc_norm_stderr,none": 0.010521147542454215}, "race": {"alias": "race", "acc,none": 0.35789473684210527, "acc_stderr,none": 0.014836467904073735}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43193449334698053, "acc_stderr,none": 0.011208746100567539}, "winogrande": {"alias": "winogrande", "acc,none": 0.6408839779005525, "acc_stderr,none": 0.013483115202120243}} {"created_at": "2025-05-02T00:29:54.179788", "global_step": 440000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.3993174061433447, "acc_stderr,none": 0.014312094557946697, "acc_norm,none": 0.44283276450511944, "acc_norm_stderr,none": 0.014515573873348904}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7117003367003367, "acc_stderr,none": 0.009294774252029625, "acc_norm,none": 0.6986531986531986, "acc_norm_stderr,none": 0.009415259879351618}, "boolq": {"alias": "boolq", "acc,none": 0.6813455657492354, "acc_stderr,none": 0.008149598998538518}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.23505323505323505, "acc_stderr,none": 0.012140003367280239}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816505}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4643497311292571, "acc_stderr,none": 0.004977081808179415, "acc_norm,none": 0.6251742680740888, "acc_norm_stderr,none": 0.004830885704380081}, "mmlu": {"acc,none": 0.33008118501637945, "acc_stderr,none": 0.003932322348635091, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3228480340063762, "acc_stderr,none": 0.006762491483932717, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1746031746031746, "acc_stderr,none": 0.033954900208561116}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.38181818181818183, "acc_stderr,none": 0.037937131711656344}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.4362745098039216, "acc_stderr,none": 0.03480693138457039}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.45147679324894513, "acc_stderr,none": 0.0323936001739747}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4297520661157025, "acc_stderr,none": 0.04519082021319773}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.4074074074074074, "acc_stderr,none": 0.04750077341199985}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.36809815950920244, "acc_stderr,none": 0.03789213935838396}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.315028901734104, "acc_stderr,none": 0.025009313790069706}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2446927374301676, "acc_stderr,none": 0.01437816988409842}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.33762057877813506, "acc_stderr,none": 0.02685882587948854}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.37037037037037035, "acc_stderr,none": 0.026869490744815244}, 
"mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3076923076923077, "acc_stderr,none": 0.01178791025166459}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.036155076303109344}, "mmlu_other": {"acc,none": 0.3723849372384937, "acc_stderr,none": 0.008633153717041974, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421255}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3433962264150943, "acc_stderr,none": 0.029224526469124792}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3352601156069364, "acc_stderr,none": 0.035995863012470784}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.45739910313901344, "acc_stderr,none": 0.033435777055830646}, "mmlu_management": {"alias": " - management", "acc,none": 0.36893203883495146, "acc_stderr,none": 0.04777615181156739}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.452991452991453, "acc_stderr,none": 0.0326109987309862}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.44, "acc_stderr,none": 0.04988876515698589}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.41762452107279696, "acc_stderr,none": 0.01763563732695152}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3300653594771242, "acc_stderr,none": 0.026925654653615693}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.32978723404255317, "acc_stderr,none": 0.028045946942042398}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2977941176470588, "acc_stderr,none": 0.027778298701545443}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3132530120481928, "acc_stderr,none": 0.036108050180310235}, "mmlu_social_sciences": {"acc,none": 0.33539161520961974, "acc_stderr,none": 0.008459286854531095, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.040969851398436695}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3181818181818182, "acc_stderr,none": 0.033184773338453315}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.35751295336787564, "acc_stderr,none": 0.03458816042181005}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3128205128205128, "acc_stderr,none": 0.023507579020645365}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3067226890756303, "acc_stderr,none": 0.029953823891887048}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3467889908256881, "acc_stderr,none": 0.020406097104093024}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3053435114503817, "acc_stderr,none": 0.04039314978724562}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.35130718954248363, "acc_stderr,none": 0.01931267606578656}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.32727272727272727, "acc_stderr,none": 0.04494290866252091}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2163265306122449, "acc_stderr,none": 0.026358916334904055}, 
"mmlu_sociology": {"alias": " - sociology", "acc,none": 0.47761194029850745, "acc_stderr,none": 0.03531987930208731}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.47, "acc_stderr,none": 0.050161355804659205}, "mmlu_stem": {"acc,none": 0.2940057088487155, "acc_stderr,none": 0.00802077169170192, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.42962962962962964, "acc_stderr,none": 0.04276349494376599}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.27631578947368424, "acc_stderr,none": 0.03639057569952924}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3472222222222222, "acc_stderr,none": 0.039812405437178615}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.038739587141493524}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.48, "acc_stderr,none": 0.050211673156867795}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.33191489361702126, "acc_stderr,none": 0.03078373675774565}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.03565998174135303}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.023068188848261117}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3741935483870968, "acc_stderr,none": 0.027528904299845777}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.1921182266009852, "acc_stderr,none": 0.027719315709614785}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.39, "acc_stderr,none": 0.04902071300001975}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.026466117538959916}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2781456953642384, "acc_stderr,none": 0.03658603262763744}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.16666666666666666, "acc_stderr,none": 0.025416428388767464}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.375, "acc_stderr,none": 0.04595091388086298}, "mmlu_pro": {"exact_match,custom-extract": 0.16190159574468085, "exact_match_stderr,custom-extract": 0.0033347341340997593, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.26778242677824265, "exact_match_stderr,custom-extract": 0.016548346247565876}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.12547528517110265, "exact_match_stderr,custom-extract": 0.011800544058606071}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.10954063604240283, "exact_match_stderr,custom-extract": 0.009286741978082975}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 
0.1878048780487805, "exact_match_stderr,custom-extract": 0.0193117650289317}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.20734597156398105, "exact_match_stderr,custom-extract": 0.013962907124731173}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.10835913312693499, "exact_match_stderr,custom-extract": 0.00999056535282795}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2114914425427873, "exact_match_stderr,custom-extract": 0.014286918817398457}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.1679790026246719, "exact_match_stderr,custom-extract": 0.019177979237567252}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1335149863760218, "exact_match_stderr,custom-extract": 0.010255319452895692}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.13323464100666174, "exact_match_stderr,custom-extract": 0.009248950748998364}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1634199134199134, "exact_match_stderr,custom-extract": 0.01217041531796006}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.15831663326653306, "exact_match_stderr,custom-extract": 0.016357727678825804}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.15396458814472672, "exact_match_stderr,custom-extract": 0.010017684430155485}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.22807017543859648, "exact_match_stderr,custom-extract": 0.014862572907065014}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.264, "acc_stderr,none": 0.019732885585922087, "acc_norm,none": 0.402, "acc_norm_stderr,none": 0.021948929609938612}, "piqa": {"alias": "piqa", "acc,none": 0.7290533188248096, "acc_stderr,none": 0.010369718937426843, "acc_norm,none": 0.7236126224156693, "acc_norm_stderr,none": 0.010434162388275619}, "race": {"alias": "race", "acc,none": 0.35406698564593303, "acc_stderr,none": 0.014800834711677318}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4508700102354145, "acc_stderr,none": 0.01125931926927394}, "winogrande": {"alias": "winogrande", "acc,none": 0.6314127861089187, "acc_stderr,none": 0.013558447570099318}} {"created_at": "2025-05-02T01:29:29.294808", "global_step": 460000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.41467576791808874, "acc_stderr,none": 0.014397070564409174, "acc_norm,none": 0.44368600682593856, "acc_norm_stderr,none": 0.014518421825670444}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7251683501683501, "acc_stderr,none": 0.00916053811525495, "acc_norm,none": 0.7201178451178452, "acc_norm_stderr,none": 0.009212077524656534}, "boolq": {"alias": "boolq", "acc,none": 0.7522935779816514, "acc_stderr,none": 0.007550137311318813}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.27764127764127766, "acc_stderr,none": 0.012821491901599497}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816505}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4714200358494324, "acc_stderr,none": 0.004981623292196192, "acc_norm,none": 0.6362278430591516, "acc_norm_stderr,none": 0.004801009657690435}, "mmlu": {"acc,none": 0.3378436120210796, "acc_stderr,none": 0.003969845442777311, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3234856535600425, "acc_stderr,none": 0.0067763449699716335, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 
0.21428571428571427, "acc_stderr,none": 0.03670066451047182}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.42424242424242425, "acc_stderr,none": 0.038592681420702615}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.45588235294117646, "acc_stderr,none": 0.03495624522015473}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.45569620253164556, "acc_stderr,none": 0.03241920684693334}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.043913262867240704}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3128834355828221, "acc_stderr,none": 0.03642914578292406}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.28901734104046245, "acc_stderr,none": 0.02440517393578323}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2737430167597765, "acc_stderr,none": 0.014912413096372434}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.34726688102893893, "acc_stderr,none": 0.027040745502307336}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.36728395061728397, "acc_stderr,none": 0.026822801759507894}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3161668839634941, "acc_stderr,none": 0.01187578089438658}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23976608187134502, "acc_stderr,none": 0.03274485211946956}, "mmlu_other": {"acc,none": 0.3656260057933698, "acc_stderr,none": 0.008629163614504259, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.35471698113207545, "acc_stderr,none": 0.029445175328199586}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3468208092485549, "acc_stderr,none": 0.036291466701596636}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.38, "acc_stderr,none": 0.048783173121456316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4170403587443946, "acc_stderr,none": 0.03309266936071721}, "mmlu_management": {"alias": " - management", "acc,none": 0.3883495145631068, "acc_stderr,none": 0.0482572933735639}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3974358974358974, "acc_stderr,none": 0.03205953453789293}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.42, "acc_stderr,none": 0.049604496374885836}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.40229885057471265, "acc_stderr,none": 0.01753529452906896}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3104575163398693, "acc_stderr,none": 0.026493033225145894}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.3049645390070922, "acc_stderr,none": 0.02746470844202213}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3382352941176471, "acc_stderr,none": 0.028739328513983572}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3313253012048193, "acc_stderr,none": 0.03664314777288085}, "mmlu_social_sciences": {"acc,none": 0.35456613584660385, "acc_stderr,none": 0.008600016453676224, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": 
" - econometrics", "acc,none": 0.30701754385964913, "acc_stderr,none": 0.04339138322579859}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.033586181457325226}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.39378238341968913, "acc_stderr,none": 0.03526077095548238}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3076923076923077, "acc_stderr,none": 0.023400928918310485}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31512605042016806, "acc_stderr,none": 0.030176808288974337}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3853211009174312, "acc_stderr,none": 0.02086585085279412}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.31297709923664124, "acc_stderr,none": 0.04066962905677698}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.35947712418300654, "acc_stderr,none": 0.019412539242032165}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.04607582090719976}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2938775510204082, "acc_stderr,none": 0.029162738410249772}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.472636815920398, "acc_stderr,none": 0.03530235517334682}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.41, "acc_stderr,none": 0.04943110704237102}, "mmlu_stem": {"acc,none": 0.31557247066286076, "acc_stderr,none": 0.008220836540501075, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621503}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3851851851851852, "acc_stderr,none": 0.042039210401562783}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.03690677986137282}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.4097222222222222, "acc_stderr,none": 0.04112490974670787}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.19607843137254902, "acc_stderr,none": 0.03950581861179962}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956911}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.33617021276595743, "acc_stderr,none": 0.030881618520676942}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.038061426873099935}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.023068188848261114}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.4096774193548387, "acc_stderr,none": 0.02797605491534736}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2955665024630542, "acc_stderr,none": 0.032104944337514575}, 
"mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.025928876132766104}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.03603038545360384}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.3101851851851852, "acc_stderr,none": 0.03154696285656628}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.18209773936170212, "exact_match_stderr,custom-extract": 0.0034753070160499716, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3528591352859135, "exact_match_stderr,custom-extract": 0.01785844267466333}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.14955640050697086, "exact_match_stderr,custom-extract": 0.012704633811017217}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.12014134275618374, "exact_match_stderr,custom-extract": 0.009667660571953616}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.22195121951219512, "exact_match_stderr,custom-extract": 0.02054804589006829}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.24407582938388625, "exact_match_stderr,custom-extract": 0.014794071577787442}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.1609907120743034, "exact_match_stderr,custom-extract": 0.01181261681476695}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2310513447432763, "exact_match_stderr,custom-extract": 0.014746599750625127}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.17060367454068243, "exact_match_stderr,custom-extract": 0.019296717799305904}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1226158038147139, "exact_match_stderr,custom-extract": 0.009889441958223886}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.14211695040710584, "exact_match_stderr,custom-extract": 0.009503205027158606}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1645021645021645, "exact_match_stderr,custom-extract": 0.012202747429494164}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1683366733466934, "exact_match_stderr,custom-extract": 0.016766733998678857}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.15473441108545036, "exact_match_stderr,custom-extract": 0.010038127358043917}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2669172932330827, "exact_match_stderr,custom-extract": 0.015668798035500312}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.274, "acc_stderr,none": 0.01996610354027947, "acc_norm,none": 0.384, "acc_norm_stderr,none": 0.021772369465547198}, "piqa": {"alias": "piqa", "acc,none": 0.7285092491838956, "acc_stderr,none": 0.010376251176596135, "acc_norm,none": 0.720892274211099, "acc_norm_stderr,none": 0.010465657948498231}, "race": {"alias": "race", "acc,none": 0.3722488038277512, "acc_stderr,none": 0.014960984760899333}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.44626407369498466, 
"acc_stderr,none": 0.011248540901547963}, "winogrande": {"alias": "winogrande", "acc,none": 0.6566692975532754, "acc_stderr,none": 0.013344823185358016}} {"created_at": "2025-05-02T01:29:53.037302", "global_step": 456000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4104095563139932, "acc_stderr,none": 0.01437492219264266, "acc_norm,none": 0.43686006825938567, "acc_norm_stderr,none": 0.014494421584256513}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7255892255892256, "acc_stderr,none": 0.009156177122244525, "acc_norm,none": 0.7117003367003367, "acc_norm_stderr,none": 0.009294774252029625}, "boolq": {"alias": "boolq", "acc,none": 0.7345565749235474, "acc_stderr,none": 0.007723090983590469}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.25634725634725636, "acc_stderr,none": 0.012500273456369256}, "copa": {"alias": "copa", "acc,none": 0.76, "acc_stderr,none": 0.04292346959909284}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47122087233618803, "acc_stderr,none": 0.004981509099276348, "acc_norm,none": 0.6349332802230632, "acc_norm_stderr,none": 0.004804649197163699}, "mmlu": {"acc,none": 0.3200398803589232, "acc_stderr,none": 0.003915836239557328, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3073326248671626, "acc_stderr,none": 0.006705491263889471, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.0361960452412425}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.4, "acc_stderr,none": 0.03825460278380026}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.4068627450980392, "acc_stderr,none": 0.03447891136353382}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.3628691983122363, "acc_stderr,none": 0.03129920825530213}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.33884297520661155, "acc_stderr,none": 0.04320767807536671}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3425925925925926, "acc_stderr,none": 0.045879047413018105}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.26380368098159507, "acc_stderr,none": 0.03462419931615623}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.31213872832369943, "acc_stderr,none": 0.02494679222527231}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.26256983240223464, "acc_stderr,none": 0.014716824273017756}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3504823151125402, "acc_stderr,none": 0.027098652621301747}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.31790123456790126, "acc_stderr,none": 0.02591006352824088}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.30378096479791394, "acc_stderr,none": 0.011745787720472472}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.25146198830409355, "acc_stderr,none": 0.033275044238468436}, "mmlu_other": {"acc,none": 0.3453492114579981, "acc_stderr,none": 0.008500614286239612, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.32075471698113206, "acc_stderr,none": 0.028727502957880263}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3179190751445087, "acc_stderr,none": 0.03550683989165582}, 
"mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4080717488789238, "acc_stderr,none": 0.03298574607842822}, "mmlu_management": {"alias": " - management", "acc,none": 0.34951456310679613, "acc_stderr,none": 0.047211885060971716}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3888888888888889, "acc_stderr,none": 0.03193705726200293}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.39, "acc_stderr,none": 0.04902071300001974}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.39719029374201786, "acc_stderr,none": 0.017497905037159363}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3104575163398693, "acc_stderr,none": 0.026493033225145894}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.026684564340460997}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2757352941176471, "acc_stderr,none": 0.027146271936625166}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3253012048192771, "acc_stderr,none": 0.03647168523683229}, "mmlu_social_sciences": {"acc,none": 0.3392915177120572, "acc_stderr,none": 0.008500942315406537, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.04303684033537315}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2676767676767677, "acc_stderr,none": 0.031544498882702866}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.37305699481865284, "acc_stderr,none": 0.03490205592048574}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3153846153846154, "acc_stderr,none": 0.02355964698318994}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31092436974789917, "acc_stderr,none": 0.030066761582977934}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3651376146788991, "acc_stderr,none": 0.02064280145438401}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3435114503816794, "acc_stderr,none": 0.041649760719448786}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3480392156862745, "acc_stderr,none": 0.019270998708223977}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.35454545454545455, "acc_stderr,none": 0.04582004841505416}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2530612244897959, "acc_stderr,none": 0.027833023871399683}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.48258706467661694, "acc_stderr,none": 0.03533389234739245}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_stem": {"acc,none": 0.2952743418966064, "acc_stderr,none": 0.008062967013205875, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.4, "acc_stderr,none": 0.04232073695151589}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2565789473684211, "acc_stderr,none": 0.03554180368025689}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3819444444444444, 
"acc_stderr,none": 0.040629907841466674}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.49, "acc_stderr,none": 0.050241839379569095}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.34893617021276596, "acc_stderr,none": 0.031158522131357776}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.038061426873099935}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.02278967314577657}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.36774193548387096, "acc_stderr,none": 0.027430866579973467}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.03144712581678241}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24814814814814815, "acc_stderr,none": 0.0263357394040558}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.25165562913907286, "acc_stderr,none": 0.035433042343899844}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.22685185185185186, "acc_stderr,none": 0.028561650102422252}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.04287858751340456}, "mmlu_pro": {"exact_match,custom-extract": 0.17253989361702127, "exact_match_stderr,custom-extract": 0.003417589322855389, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.2789400278940028, "exact_match_stderr,custom-extract": 0.016760408620091195}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.14702154626108999, "exact_match_stderr,custom-extract": 0.0126152660719504}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.10159010600706714, "exact_match_stderr,custom-extract": 0.00898321237967247}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.20243902439024392, "exact_match_stderr,custom-extract": 0.019868606646141387}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.22274881516587677, "exact_match_stderr,custom-extract": 0.014330937777746751}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.14035087719298245, "exact_match_stderr,custom-extract": 0.011164274322169068}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.22493887530562348, "exact_match_stderr,custom-extract": 0.014607947807460348}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.16535433070866143, "exact_match_stderr,custom-extract": 0.01905754968679375}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 
0.14259763851044505, "exact_match_stderr,custom-extract": 0.010542707604685763}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.1391561806069578, "exact_match_stderr,custom-extract": 0.009419905559243661}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.17424242424242425, "exact_match_stderr,custom-extract": 0.01248539783451787}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1683366733466934, "exact_match_stderr,custom-extract": 0.016766733998678864}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.15088529638183218, "exact_match_stderr,custom-extract": 0.009935032556231118}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2568922305764411, "exact_match_stderr,custom-extract": 0.015476481223905561}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.282, "acc_stderr,none": 0.02014357284729078, "acc_norm,none": 0.38, "acc_norm_stderr,none": 0.02172888143870171}, "piqa": {"alias": "piqa", "acc,none": 0.7252448313384113, "acc_stderr,none": 0.010415033676676039, "acc_norm,none": 0.7230685527747551, "acc_norm_stderr,none": 0.010440499969334535}, "race": {"alias": "race", "acc,none": 0.3703349282296651, "acc_stderr,none": 0.014945205447391755}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.43551688843398156, "acc_stderr,none": 0.011219586604022601}, "winogrande": {"alias": "winogrande", "acc,none": 0.6432517758484609, "acc_stderr,none": 0.01346339395802871}} {"created_at": "2025-05-02T01:36:07.479121", "global_step": 458000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4061433447098976, "acc_stderr,none": 0.014351656690097863, "acc_norm,none": 0.4300341296928328, "acc_norm_stderr,none": 0.014467631559137996}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7226430976430976, "acc_stderr,none": 0.009186490105111904, "acc_norm,none": 0.7087542087542088, "acc_norm_stderr,none": 0.009322788837938863}, "boolq": {"alias": "boolq", "acc,none": 0.7256880733944954, "acc_stderr,none": 0.007803507959383008}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2571662571662572, "acc_stderr,none": 0.012513329723602728}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816505}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4690300736904999, "acc_stderr,none": 0.00498020045185168, "acc_norm,none": 0.6331408086038638, "acc_norm_stderr,none": 0.004809626723626839}, "mmlu": {"acc,none": 0.31825950719270757, "acc_stderr,none": 0.0039118293259077916, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.30456960680127526, "acc_stderr,none": 0.00668018764168108, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03670066451047181}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.37575757575757573, "acc_stderr,none": 0.03781887353205983}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.4264705882352941, "acc_stderr,none": 0.03471157907953426}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.379746835443038, "acc_stderr,none": 0.0315918875296585}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.30578512396694213, "acc_stderr,none": 0.04205953933884122}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.37037037037037035, "acc_stderr,none": 0.04668408033024932}, 
"mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.31901840490797545, "acc_stderr,none": 0.03661997551073836}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.28901734104046245, "acc_stderr,none": 0.02440517393578323}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24581005586592178, "acc_stderr,none": 0.014400296429225608}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.34726688102893893, "acc_stderr,none": 0.02704074550230734}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.32098765432098764, "acc_stderr,none": 0.025976566010862744}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.30378096479791394, "acc_stderr,none": 0.01174578772047246}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23391812865497075, "acc_stderr,none": 0.03246721765117827}, "mmlu_other": {"acc,none": 0.3482458963630512, "acc_stderr,none": 0.008530734427513235, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3471698113207547, "acc_stderr,none": 0.029300101705549652}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3063583815028902, "acc_stderr,none": 0.03514942551267438}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.42152466367713004, "acc_stderr,none": 0.033141902221106564}, "mmlu_management": {"alias": " - management", "acc,none": 0.34951456310679613, "acc_stderr,none": 0.047211885060971716}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.39316239316239315, "acc_stderr,none": 0.03199957924651047}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.38, "acc_stderr,none": 0.04878317312145633}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.384418901660281, "acc_stderr,none": 0.01739568874281962}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.28104575163398693, "acc_stderr,none": 0.025738854797818726}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2978723404255319, "acc_stderr,none": 0.027281608344469414}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.31985294117647056, "acc_stderr,none": 0.028332959514031225}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3313253012048193, "acc_stderr,none": 0.03664314777288085}, "mmlu_social_sciences": {"acc,none": 0.3311667208319792, "acc_stderr,none": 0.008461161595148096, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.3508771929824561, "acc_stderr,none": 0.04489539350270698}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.26262626262626265, "acc_stderr,none": 0.031353050095330855}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.35233160621761656, "acc_stderr,none": 0.03447478286414358}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2923076923076923, "acc_stderr,none": 0.023060438380857726}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2773109243697479, "acc_stderr,none": 0.02907937453948001}, "mmlu_high_school_psychology": {"alias": " - 
high_school_psychology", "acc,none": 0.3577981651376147, "acc_stderr,none": 0.020552060784827825}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.33587786259541985, "acc_stderr,none": 0.041423137719966634}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3464052287581699, "acc_stderr,none": 0.01924978569171721}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.35454545454545455, "acc_stderr,none": 0.045820048415054174}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2653061224489796, "acc_stderr,none": 0.0282638899437846}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.43781094527363185, "acc_stderr,none": 0.0350808011219984}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_stem": {"acc,none": 0.29654297494449733, "acc_stderr,none": 0.008080207838080783, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.37777777777777777, "acc_stderr,none": 0.04188307537595853}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.03583496176361063}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3611111111111111, "acc_stderr,none": 0.040166600304512336}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.5, "acc_stderr,none": 0.050251890762960605}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3446808510638298, "acc_stderr,none": 0.03106898596312215}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2896551724137931, "acc_stderr,none": 0.037800192304380135}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.02278967314577657}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.36774193548387096, "acc_stderr,none": 0.02743086657997347}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.28078817733990147, "acc_stderr,none": 0.0316185633535861}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.02659393910184407}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.034791855725996586}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.02835321286686346}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3482142857142857, "acc_stderr,none": 0.04521829902833586}, "mmlu_pro": {"exact_match,custom-extract": 0.1802692819148936, 
"exact_match_stderr,custom-extract": 0.003467064417445248, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.32217573221757323, "exact_match_stderr,custom-extract": 0.01746419040868443}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.17490494296577946, "exact_match_stderr,custom-extract": 0.01353287048704677}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.11484098939929328, "exact_match_stderr,custom-extract": 0.009480425016358224}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1926829268292683, "exact_match_stderr,custom-extract": 0.019502129313734507}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.235781990521327, "exact_match_stderr,custom-extract": 0.014620093887364653}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.15583075335397317, "exact_match_stderr,custom-extract": 0.011657452909176063}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.24449877750611246, "exact_match_stderr,custom-extract": 0.01503643576839438}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.15223097112860892, "exact_match_stderr,custom-extract": 0.018428860558049234}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1262488646684832, "exact_match_stderr,custom-extract": 0.010014085027799643}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.14655810510732792, "exact_match_stderr,custom-extract": 0.009625538058445092}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1645021645021645, "exact_match_stderr,custom-extract": 0.012202747429494166}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1282565130260521, "exact_match_stderr,custom-extract": 0.014983711363001561}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.16320246343341033, "exact_match_stderr,custom-extract": 0.010257374338618739}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2731829573934837, "exact_match_stderr,custom-extract": 0.0157837499294662}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.268, "acc_stderr,none": 0.019827714859587564, "acc_norm,none": 0.39, "acc_norm_stderr,none": 0.02183468586936921}, "piqa": {"alias": "piqa", "acc,none": 0.7257889009793254, "acc_stderr,none": 0.010408618664933382, "acc_norm,none": 0.7219804134929271, "acc_norm_stderr,none": 0.010453117358332813}, "race": {"alias": "race", "acc,none": 0.37607655502392345, "acc_stderr,none": 0.014991791489173295}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.44268167860798363, "acc_stderr,none": 0.011239482425741956}, "winogrande": {"alias": "winogrande", "acc,none": 0.6471981057616417, "acc_stderr,none": 0.013429728101788956}} {"created_at": "2025-05-02T02:22:12.165354", "global_step": 462000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4138225255972696, "acc_stderr,none": 0.01439273000922101, "acc_norm,none": 0.4351535836177474, "acc_norm_stderr,none": 0.014487986197186047}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7222222222222222, "acc_stderr,none": 0.009190779909649916, "acc_norm,none": 0.7062289562289562, "acc_norm_stderr,none": 0.009346423298166723}, "boolq": {"alias": "boolq", "acc,none": 0.755045871559633, "acc_stderr,none": 0.007521796682240532}, "commonsense_qa": {"alias": "commonsense_qa", 
"acc,none": 0.3013923013923014, "acc_stderr,none": 0.013137201028519572}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816505}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47102170882294364, "acc_stderr,none": 0.004981394110706143, "acc_norm,none": 0.6400119498107947, "acc_norm_stderr,none": 0.0047901553709934494}, "mmlu": {"acc,none": 0.34553482409913117, "acc_stderr,none": 0.003983625970694228, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3249734325185972, "acc_stderr,none": 0.0067813427457944055, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.03852273364924316}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.41818181818181815, "acc_stderr,none": 0.03851716319398394}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.43137254901960786, "acc_stderr,none": 0.03476099060501637}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.4219409282700422, "acc_stderr,none": 0.032148146302403695}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.38016528925619836, "acc_stderr,none": 0.04431324501968432}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3425925925925926, "acc_stderr,none": 0.045879047413018105}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.25153374233128833, "acc_stderr,none": 0.034089978868575295}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2832369942196532, "acc_stderr,none": 0.024257901705323374}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2435754189944134, "acc_stderr,none": 0.014355911964767864}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3633440514469453, "acc_stderr,none": 0.02731684767419271}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.4012345679012346, "acc_stderr,none": 0.02727258284983979}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3272490221642764, "acc_stderr,none": 0.01198381980646475}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.32748538011695905, "acc_stderr,none": 0.035993357714560276}, "mmlu_other": {"acc,none": 0.37850016092693917, "acc_stderr,none": 0.00869037872037017, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.37358490566037733, "acc_stderr,none": 0.02977308271331987}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.31213872832369943, "acc_stderr,none": 0.035331333893236574}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.39461883408071746, "acc_stderr,none": 0.03280400504755291}, "mmlu_management": {"alias": " - management", "acc,none": 0.4368932038834951, "acc_stderr,none": 0.04911147107365777}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.4017094017094017, "acc_stderr,none": 0.03211693751051621}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.45, "acc_stderr,none": 0.049999999999999996}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.4112388250319285, "acc_stderr,none": 0.01759597190805657}, "mmlu_nutrition": {"alias": " - nutrition", 
"acc,none": 0.3627450980392157, "acc_stderr,none": 0.027530078447110307}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.3404255319148936, "acc_stderr,none": 0.028267657482650144}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3897058823529412, "acc_stderr,none": 0.029624663581159696}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3493975903614458, "acc_stderr,none": 0.03711725190740748}, "mmlu_social_sciences": {"acc,none": 0.3669158271043224, "acc_stderr,none": 0.008609939958050058, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04434600701584926}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3282828282828283, "acc_stderr,none": 0.03345678422756776}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.43005181347150256, "acc_stderr,none": 0.035729543331448094}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3230769230769231, "acc_stderr,none": 0.02371088850197056}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31932773109243695, "acc_stderr,none": 0.0302839955258844}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.43119266055045874, "acc_stderr,none": 0.02123336503031956}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2900763358778626, "acc_stderr,none": 0.03980066246467765}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3431372549019608, "acc_stderr,none": 0.01920660684882536}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.04350271442923243}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2816326530612245, "acc_stderr,none": 0.028795185574291293}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.5373134328358209, "acc_stderr,none": 0.03525675167467974}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956911}, "mmlu_stem": {"acc,none": 0.3228671106882334, "acc_stderr,none": 0.008271307715233647, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.37777777777777777, "acc_stderr,none": 0.04188307537595853}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3026315789473684, "acc_stderr,none": 0.037385206761196686}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3888888888888889, "acc_stderr,none": 0.04076663253918567}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.3, "acc_stderr,none": 0.04605661864718381}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.30392156862745096, "acc_stderr,none": 0.04576665403207763}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.5, "acc_stderr,none": 0.050251890762960605}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 
0.35319148936170214, "acc_stderr,none": 0.031245325202761926}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.32413793103448274, "acc_stderr,none": 0.03900432069185555}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.28835978835978837, "acc_stderr,none": 0.0233306540545359}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.4161290322580645, "acc_stderr,none": 0.02804098138076154}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.03178529710642749}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.38, "acc_stderr,none": 0.04878317312145632}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2074074074074074, "acc_stderr,none": 0.024720713193952176}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.03603038545360384}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.35185185185185186, "acc_stderr,none": 0.032568505702936464}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.2767857142857143, "acc_stderr,none": 0.04246624336697624}, "mmlu_pro": {"exact_match,custom-extract": 0.17553191489361702, "exact_match_stderr,custom-extract": 0.0034342487056358705, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.31659693165969316, "exact_match_stderr,custom-extract": 0.017383423143106428}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.14068441064638784, "exact_match_stderr,custom-extract": 0.012386146840836963}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.10777385159010601, "exact_match_stderr,custom-extract": 0.00922067835765709}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.2, "exact_match_stderr,custom-extract": 0.019778727057365938}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.23104265402843602, "exact_match_stderr,custom-extract": 0.014517218722607162}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.15892672858617132, "exact_match_stderr,custom-extract": 0.011751078002557007}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2310513447432763, "exact_match_stderr,custom-extract": 0.014746599750625125}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.16010498687664043, "exact_match_stderr,custom-extract": 0.018811487255752723}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.14713896457765668, "exact_match_stderr,custom-extract": 0.010680870311860846}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.13693560325684678, "exact_match_stderr,custom-extract": 0.009356488887893854}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.14935064935064934, "exact_match_stderr,custom-extract": 0.011732160468660269}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.16032064128256512, "exact_match_stderr,custom-extract": 0.016441324005775013}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.15396458814472672, "exact_match_stderr,custom-extract": 0.010017684430155488}, "mmlu_pro_psychology": {"alias": " - psychology", 
"exact_match,custom-extract": 0.2581453634085213, "exact_match_stderr,custom-extract": 0.015501096199517839}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.27, "acc_stderr,none": 0.01987435483128748, "acc_norm,none": 0.392, "acc_norm_stderr,none": 0.021854684955611263}, "piqa": {"alias": "piqa", "acc,none": 0.7279651795429815, "acc_stderr,none": 0.010382763786247383, "acc_norm,none": 0.7181719260065288, "acc_norm_stderr,none": 0.010496675231258155}, "race": {"alias": "race", "acc,none": 0.37799043062200954, "acc_stderr,none": 0.015006820447473677}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4508700102354145, "acc_stderr,none": 0.011259319269273937}, "winogrande": {"alias": "winogrande", "acc,none": 0.6408839779005525, "acc_stderr,none": 0.013483115202120243}} {"created_at": "2025-05-02T04:45:20.614331", "global_step": 464000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.41638225255972694, "acc_stderr,none": 0.014405618279436167, "acc_norm,none": 0.44197952218430037, "acc_norm_stderr,none": 0.014512682523128345}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7218013468013468, "acc_stderr,none": 0.009195059601583897, "acc_norm,none": 0.7108585858585859, "acc_norm_stderr,none": 0.009302827114597427}, "boolq": {"alias": "boolq", "acc,none": 0.7223241590214067, "acc_stderr,none": 0.007832991548377194}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.3194103194103194, "acc_stderr,none": 0.013348646620922276}, "copa": {"alias": "copa", "acc,none": 0.78, "acc_stderr,none": 0.041633319989322626}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.46992630950009956, "acc_stderr,none": 0.004980747448813316, "acc_norm,none": 0.6346345349531965, "acc_norm_stderr,none": 0.004805483767055349}, "mmlu": {"acc,none": 0.35336846603048, "acc_stderr,none": 0.004001339563566479, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.33538788522848034, "acc_stderr,none": 0.0068102534764985356, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.03718489006818114}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.37575757575757573, "acc_stderr,none": 0.037818873532059816}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.4803921568627451, "acc_stderr,none": 0.03506612560524866}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.4810126582278481, "acc_stderr,none": 0.03252375148090448}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4380165289256198, "acc_stderr,none": 0.045291468044357915}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.4074074074074074, "acc_stderr,none": 0.04750077341199985}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3067484662576687, "acc_stderr,none": 0.03623089915724146}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3179190751445087, "acc_stderr,none": 0.025070713719153176}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23128491620111732, "acc_stderr,none": 0.014102223623152594}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.37942122186495175, "acc_stderr,none": 0.027559949802347824}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.39814814814814814, "acc_stderr,none": 0.02723741509459247}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3305084745762712, "acc_stderr,none": 
0.012014142101842968}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3391812865497076, "acc_stderr,none": 0.03631053496488905}, "mmlu_other": {"acc,none": 0.38429353073704536, "acc_stderr,none": 0.008713132677256766, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.36981132075471695, "acc_stderr,none": 0.029711421880107922}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.37572254335260113, "acc_stderr,none": 0.03692820767264867}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.43946188340807174, "acc_stderr,none": 0.03331092511038179}, "mmlu_management": {"alias": " - management", "acc,none": 0.4077669902912621, "acc_stderr,none": 0.04865777570410769}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.44017094017094016, "acc_stderr,none": 0.032520741720630506}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.45, "acc_stderr,none": 0.04999999999999999}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.3997445721583653, "acc_stderr,none": 0.01751684790705328}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.39215686274509803, "acc_stderr,none": 0.027956046165424523}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.31560283687943264, "acc_stderr,none": 0.027724989449509314}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.375, "acc_stderr,none": 0.029408372932278746}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.38056548586285344, "acc_stderr,none": 0.008687529319790162, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.32456140350877194, "acc_stderr,none": 0.044045561573747685}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.35353535353535354, "acc_stderr,none": 0.03406086723547153}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.42487046632124353, "acc_stderr,none": 0.035674713352125395}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3435897435897436, "acc_stderr,none": 0.024078696580635477}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2815126050420168, "acc_stderr,none": 0.029213549414372163}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.43486238532110094, "acc_stderr,none": 0.021254631465609283}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.35877862595419846, "acc_stderr,none": 0.04206739313864908}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.34967320261437906, "acc_stderr,none": 0.019291961895066382}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3090909090909091, "acc_stderr,none": 0.044262946482000985}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.363265306122449, "acc_stderr,none": 0.030789051139030806}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.5422885572139303, "acc_stderr,none": 0.03522865864099597}, 
"mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.51, "acc_stderr,none": 0.05024183937956912}, "mmlu_stem": {"acc,none": 0.32318426895020613, "acc_stderr,none": 0.008287426452089752, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.42962962962962964, "acc_stderr,none": 0.04276349494376599}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.28289473684210525, "acc_stderr,none": 0.03665349695640766}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.4166666666666667, "acc_stderr,none": 0.04122728707651283}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.30392156862745096, "acc_stderr,none": 0.045766654032077636}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.46, "acc_stderr,none": 0.05009082659620333}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3276595744680851, "acc_stderr,none": 0.030683020843231008}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2827586206896552, "acc_stderr,none": 0.03752833958003336}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.023068188848261114}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3967741935483871, "acc_stderr,none": 0.027831231605767958}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3103448275862069, "acc_stderr,none": 0.032550867699701044}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23333333333333334, "acc_stderr,none": 0.025787874220959316}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.32450331125827814, "acc_stderr,none": 0.038227469376587525}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.33796296296296297, "acc_stderr,none": 0.03225941352631295}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.33035714285714285, "acc_stderr,none": 0.04464285714285713}, "mmlu_pro": {"exact_match,custom-extract": 0.17852393617021275, "exact_match_stderr,custom-extract": 0.003451886250189789, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3333333333333333, "exact_match_stderr,custom-extract": 0.017617214086056415}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.15082382762991128, "exact_match_stderr,custom-extract": 0.012748842917983895}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.1157243816254417, "exact_match_stderr,custom-extract": 0.009512068239624279}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.2097560975609756, "exact_match_stderr,custom-extract": 0.020131503920840912}, "mmlu_pro_economics": 
{"alias": " - economics", "exact_match,custom-extract": 0.24170616113744076, "exact_match_stderr,custom-extract": 0.014745137840103603}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.14963880288957687, "exact_match_stderr,custom-extract": 0.011465319227973019}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2334963325183374, "exact_match_stderr,custom-extract": 0.01480083176145419}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.1679790026246719, "exact_match_stderr,custom-extract": 0.019177979237567255}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.12534059945504086, "exact_match_stderr,custom-extract": 0.0099831828402155}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.13545521835677277, "exact_match_stderr,custom-extract": 0.00931375335793057}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1634199134199134, "exact_match_stderr,custom-extract": 0.01217041531796006}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1623246492985972, "exact_match_stderr,custom-extract": 0.016524009398562543}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1585835257890685, "exact_match_stderr,custom-extract": 0.010139048344815254}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2631578947368421, "exact_match_stderr,custom-extract": 0.0155979045489076}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.276, "acc_stderr,none": 0.02001121929807353, "acc_norm,none": 0.384, "acc_norm_stderr,none": 0.0217723694655472}, "piqa": {"alias": "piqa", "acc,none": 0.7247007616974973, "acc_stderr,none": 0.01042142927736953, "acc_norm,none": 0.719804134929271, "acc_norm_stderr,none": 0.01047812201557708}, "race": {"alias": "race", "acc,none": 0.37607655502392345, "acc_stderr,none": 0.014991791489173296}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.44779938587512796, "acc_stderr,none": 0.011252242102001767}, "winogrande": {"alias": "winogrande", "acc,none": 0.6448303078137332, "acc_stderr,none": 0.013450047479569256}} {"created_at": "2025-05-02T06:47:54.148041", "global_step": 466000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4197952218430034, "acc_stderr,none": 0.014422181226303022, "acc_norm,none": 0.44197952218430037, "acc_norm_stderr,none": 0.014512682523128345}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7268518518518519, "acc_stderr,none": 0.009143032718360342, "acc_norm,none": 0.7133838383838383, "acc_norm_stderr,none": 0.009278551100969295}, "boolq": {"alias": "boolq", "acc,none": 0.7201834862385321, "acc_stderr,none": 0.00785146664100135}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.29811629811629814, "acc_stderr,none": 0.013096206775112647}, "copa": {"alias": "copa", "acc,none": 0.78, "acc_stderr,none": 0.041633319989322626}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4726150169288986, "acc_stderr,none": 0.004982291744069912, "acc_norm,none": 0.6401115315674168, "acc_norm_stderr,none": 0.004789865379084507}, "mmlu": {"acc,none": 0.34218772254664576, "acc_stderr,none": 0.003973081861618455, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.32518597236981933, "acc_stderr,none": 0.006766156713538489, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.0361960452412425}, "mmlu_high_school_european_history": {"alias": " - 
high_school_european_history", "acc,none": 0.42424242424242425, "acc_stderr,none": 0.03859268142070262}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.49019607843137253, "acc_stderr,none": 0.03508637358630572}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.47257383966244726, "acc_stderr,none": 0.032498227183013026}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4132231404958678, "acc_stderr,none": 0.04495087843548408}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3425925925925926, "acc_stderr,none": 0.045879047413018105}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3006134969325153, "acc_stderr,none": 0.0360251131880677}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.31213872832369943, "acc_stderr,none": 0.024946792225272314}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217887}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.34726688102893893, "acc_stderr,none": 0.027040745502307336}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3487654320987654, "acc_stderr,none": 0.02651759772446501}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3161668839634941, "acc_stderr,none": 0.01187578089438658}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.381074991953653, "acc_stderr,none": 0.008684094703851232, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.38, "acc_stderr,none": 0.048783173121456316}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3660377358490566, "acc_stderr,none": 0.029647813539365245}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3179190751445087, "acc_stderr,none": 0.03550683989165582}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4170403587443946, "acc_stderr,none": 0.03309266936071721}, "mmlu_management": {"alias": " - management", "acc,none": 0.4174757281553398, "acc_stderr,none": 0.04882840548212238}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.4444444444444444, "acc_stderr,none": 0.03255326307272487}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.41, "acc_stderr,none": 0.04943110704237102}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.4342273307790549, "acc_stderr,none": 0.01772458938967779}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3660130718954248, "acc_stderr,none": 0.02758281141515961}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.32978723404255317, "acc_stderr,none": 0.028045946942042398}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3161764705882353, "acc_stderr,none": 0.028245687391462923}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.29518072289156627, "acc_stderr,none": 0.035509201856896294}, "mmlu_social_sciences": {"acc,none": 0.36106597335066626, "acc_stderr,none": 0.008629922063628219, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.04434600701584926}, 
"mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3282828282828283, "acc_stderr,none": 0.03345678422756776}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.40414507772020725, "acc_stderr,none": 0.0354150857888402}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3128205128205128, "acc_stderr,none": 0.023507579020645365}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3277310924369748, "acc_stderr,none": 0.03048991141767323}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3889908256880734, "acc_stderr,none": 0.020902300887392873}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3053435114503817, "acc_stderr,none": 0.040393149787245605}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3758169934640523, "acc_stderr,none": 0.019594021136577443}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.34545454545454546, "acc_stderr,none": 0.04554619617541054}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.0289205832206756}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.46766169154228854, "acc_stderr,none": 0.035281314729336065}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.46, "acc_stderr,none": 0.05009082659620333}, "mmlu_stem": {"acc,none": 0.3108150967332699, "acc_stderr,none": 0.00818439307167464, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.4074074074074074, "acc_stderr,none": 0.04244633238353228}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.27631578947368424, "acc_stderr,none": 0.03639057569952925}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.375, "acc_stderr,none": 0.04048439222695598}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.04220773659171453}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956911}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.34893617021276596, "acc_stderr,none": 0.031158522131357773}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3103448275862069, "acc_stderr,none": 0.03855289616378948}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.26455026455026454, "acc_stderr,none": 0.02271746789770861}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.4161290322580645, "acc_stderr,none": 0.028040981380761543}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.03178529710642749}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.35, "acc_stderr,none": 
0.0479372485441102}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26296296296296295, "acc_stderr,none": 0.02684205787383371}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.26490066225165565, "acc_stderr,none": 0.036030385453603826}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.02896370257079102}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "mmlu_pro": {"exact_match,custom-extract": 0.17852393617021275, "exact_match_stderr,custom-extract": 0.0034543734679263758, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.32217573221757323, "exact_match_stderr,custom-extract": 0.017464190408684424}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.16856780735107732, "exact_match_stderr,custom-extract": 0.013336369763619692}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.11307420494699646, "exact_match_stderr,custom-extract": 0.009416599764522278}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.20243902439024392, "exact_match_stderr,custom-extract": 0.01986860664614139}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.245260663507109, "exact_match_stderr,custom-extract": 0.014818309281701548}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.1568627450980392, "exact_match_stderr,custom-extract": 0.0116888387242459}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.23471882640586797, "exact_match_stderr,custom-extract": 0.014827688336877391}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.16272965879265092, "exact_match_stderr,custom-extract": 0.018935396882827783}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.12443233424159855, "exact_match_stderr,custom-extract": 0.009952109344107885}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.12953367875647667, "exact_match_stderr,custom-extract": 0.009139037343919207}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.16233766233766234, "exact_match_stderr,custom-extract": 0.01213789253691545}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.17835671342685372, "exact_match_stderr,custom-extract": 0.017154249195657556}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.15704387990762125, "exact_match_stderr,custom-extract": 0.010098936605569616}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2568922305764411, "exact_match_stderr,custom-extract": 0.01547648122390555}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.266, "acc_stderr,none": 0.019780559675655483, "acc_norm,none": 0.392, "acc_norm_stderr,none": 0.021854684955611263}, "piqa": {"alias": "piqa", "acc,none": 0.7274211099020674, "acc_stderr,none": 0.010389256803296018, "acc_norm,none": 0.7219804134929271, "acc_norm_stderr,none": 0.010453117358332816}, "race": {"alias": "race", "acc,none": 0.37320574162679426, "acc_stderr,none": 0.014968780283761241}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4539406345957011, "acc_stderr,none": 0.01126596346751048}, "winogrande": {"alias": "winogrande", "acc,none": 0.6527229676400947, 
"acc_stderr,none": 0.013380909249751239}} {"created_at": "2025-05-02T08:26:36.736335", "global_step": 468000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4087030716723549, "acc_stderr,none": 0.014365750345427001, "acc_norm,none": 0.43856655290102387, "acc_norm_stderr,none": 0.014500682618212864}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7268518518518519, "acc_stderr,none": 0.00914303271836034, "acc_norm,none": 0.7142255892255892, "acc_norm_stderr,none": 0.009270380606981212}, "boolq": {"alias": "boolq", "acc,none": 0.746177370030581, "acc_stderr,none": 0.007611648880497177}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.28665028665028663, "acc_stderr,none": 0.012946355263212262}, "copa": {"alias": "copa", "acc,none": 0.8, "acc_stderr,none": 0.04020151261036845}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47321250746863175, "acc_stderr,none": 0.004982615233057107, "acc_norm,none": 0.6397132045409281, "acc_norm_stderr,none": 0.004791024004587998}, "mmlu": {"acc,none": 0.35336846603048, "acc_stderr,none": 0.003999673283279984, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3394261424017003, "acc_stderr,none": 0.006831067095024671, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.038522733649243156}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3939393939393939, "acc_stderr,none": 0.038154943086889305}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.4852941176470588, "acc_stderr,none": 0.03507793834791324}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.4641350210970464, "acc_stderr,none": 0.03246338898055659}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.47107438016528924, "acc_stderr,none": 0.04556710331269498}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.39814814814814814, "acc_stderr,none": 0.04732332615978815}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.34355828220858897, "acc_stderr,none": 0.03731133519673891}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3092485549132948, "acc_stderr,none": 0.024883140570071755}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2335195530726257, "acc_stderr,none": 0.014149575348976266}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3762057877813505, "acc_stderr,none": 0.027513925683549427}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.41358024691358025, "acc_stderr,none": 0.027402042040269952}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.33116036505867014, "acc_stderr,none": 0.012020128195985759}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3567251461988304, "acc_stderr,none": 0.03674013002860954}, "mmlu_other": {"acc,none": 0.38461538461538464, "acc_stderr,none": 0.00870506951597864, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.36981132075471695, "acc_stderr,none": 0.029711421880107922}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3699421965317919, "acc_stderr,none": 0.036812296333943194}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, 
"mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4125560538116592, "acc_stderr,none": 0.03304062175449297}, "mmlu_management": {"alias": " - management", "acc,none": 0.3786407766990291, "acc_stderr,none": 0.04802694698258974}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.48717948717948717, "acc_stderr,none": 0.032745319388423504}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.44, "acc_stderr,none": 0.049888765156985884}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.41379310344827586, "acc_stderr,none": 0.017612204084663772}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.38235294117647056, "acc_stderr,none": 0.027826109307283697}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.32269503546099293, "acc_stderr,none": 0.02788913930053478}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3382352941176471, "acc_stderr,none": 0.028739328513983576}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.29518072289156627, "acc_stderr,none": 0.035509201856896294}, "mmlu_social_sciences": {"acc,none": 0.37764055898602533, "acc_stderr,none": 0.008686693331236915, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.32456140350877194, "acc_stderr,none": 0.044045561573747685}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.034273086529999344}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.47668393782383417, "acc_stderr,none": 0.03604513672442205}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3230769230769231, "acc_stderr,none": 0.02371088850197057}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.03038835355188684}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.4091743119266055, "acc_stderr,none": 0.02108067026443373}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.33587786259541985, "acc_stderr,none": 0.04142313771996663}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.369281045751634, "acc_stderr,none": 0.01952431674486635}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.37272727272727274, "acc_stderr,none": 0.046313813194254635}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2938775510204082, "acc_stderr,none": 0.02916273841024978}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.5074626865671642, "acc_stderr,none": 0.035351400842767194}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.5, "acc_stderr,none": 0.050251890762960605}, "mmlu_stem": {"acc,none": 0.3196955280685062, "acc_stderr,none": 0.008242128107631188, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.4074074074074074, "acc_stderr,none": 0.04244633238353228}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.27631578947368424, "acc_stderr,none": 0.03639057569952925}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3819444444444444, "acc_stderr,none": 0.040629907841466674}, "mmlu_college_chemistry": {"alias": " - 
college_chemistry", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.48, "acc_stderr,none": 0.050211673156867795}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3148936170212766, "acc_stderr,none": 0.030363582197238174}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3310344827586207, "acc_stderr,none": 0.039215453124671215}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.02326651221373057}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.43548387096774194, "acc_stderr,none": 0.028206225591502737}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.031785297106427496}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.4, "acc_stderr,none": 0.049236596391733084}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.02592887613276611}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2913907284768212, "acc_stderr,none": 0.037101857261199946}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2916666666666667, "acc_stderr,none": 0.030998666304560534}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.375, "acc_stderr,none": 0.04595091388086298}, "mmlu_pro": {"exact_match,custom-extract": 0.17869015957446807, "exact_match_stderr,custom-extract": 0.0034591684992365955, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3110181311018131, "exact_match_stderr,custom-extract": 0.017299766412175892}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.15716096324461343, "exact_match_stderr,custom-extract": 0.012965269640409612}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.10777385159010601, "exact_match_stderr,custom-extract": 0.00922067835765708}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1951219512195122, "exact_match_stderr,custom-extract": 0.01959550465961621}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.24644549763033174, "exact_match_stderr,custom-extract": 0.014842395208509539}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.1475748194014448, "exact_match_stderr,custom-extract": 0.011399783059415297}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.22982885085574573, "exact_match_stderr,custom-extract": 0.014719222403513966}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.16272965879265092, "exact_match_stderr,custom-extract": 0.018935396882827787}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.14259763851044505, "exact_match_stderr,custom-extract": 0.010542707604685762}, 
"mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.14877868245743894, "exact_match_stderr,custom-extract": 0.00968555954916866}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.170995670995671, "exact_match_stderr,custom-extract": 0.012392818993997882}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.18236472945891782, "exact_match_stderr,custom-extract": 0.017303563884617013}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1424172440338722, "exact_match_stderr,custom-extract": 0.009700228446933839}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2606516290726817, "exact_match_stderr,custom-extract": 0.015549829134623425}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.272, "acc_stderr,none": 0.019920483209566072, "acc_norm,none": 0.392, "acc_norm_stderr,none": 0.021854684955611263}, "piqa": {"alias": "piqa", "acc,none": 0.7236126224156693, "acc_stderr,none": 0.010434162388275619, "acc_norm,none": 0.7154515778019587, "acc_norm_stderr,none": 0.010527218464130617}, "race": {"alias": "race", "acc,none": 0.3770334928229665, "acc_stderr,none": 0.014999337089843356}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.44882292732855683, "acc_stderr,none": 0.011254649314820134}, "winogrande": {"alias": "winogrande", "acc,none": 0.6495659037095501, "acc_stderr,none": 0.013409047676670185}} {"created_at": "2025-05-02T10:20:46.748451", "global_step": 470000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.41638225255972694, "acc_stderr,none": 0.014405618279436165, "acc_norm,none": 0.4402730375426621, "acc_norm_stderr,none": 0.014506769524804244}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7255892255892256, "acc_stderr,none": 0.00915617712224453, "acc_norm,none": 0.7163299663299664, "acc_norm_stderr,none": 0.009249781691140749}, "boolq": {"alias": "boolq", "acc,none": 0.7345565749235474, "acc_stderr,none": 0.0077230909835904705}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.3013923013923014, "acc_stderr,none": 0.013137201028519573}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816505}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.472814180442143, "acc_stderr,none": 0.004982400368939672, "acc_norm,none": 0.6405098585939056, "acc_norm_stderr,none": 0.004788703173474758}, "mmlu": {"acc,none": 0.35457911978350665, "acc_stderr,none": 0.004000324451491185, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.34070138150903295, "acc_stderr,none": 0.006836008246130835, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03670066451047182}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.42424242424242425, "acc_stderr,none": 0.038592681420702615}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.4852941176470588, "acc_stderr,none": 0.03507793834791324}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.510548523206751, "acc_stderr,none": 0.032539983791662855}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4462809917355372, "acc_stderr,none": 0.04537935177947879}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3888888888888889, "acc_stderr,none": 0.047128212574267705}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3312883435582822, 
"acc_stderr,none": 0.03697983910025588}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.31213872832369943, "acc_stderr,none": 0.02494679222527231}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.25027932960893856, "acc_stderr,none": 0.01448750085285041}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.36977491961414793, "acc_stderr,none": 0.027417996705631}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3950617283950617, "acc_stderr,none": 0.027201117666925657}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3246414602346806, "acc_stderr,none": 0.011959089388530022}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3684210526315789, "acc_stderr,none": 0.036996580176568775}, "mmlu_other": {"acc,none": 0.3881557772771162, "acc_stderr,none": 0.008719265287330627, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.37358490566037733, "acc_stderr,none": 0.029773082713319875}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3583815028901734, "acc_stderr,none": 0.036563436533531585}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4170403587443946, "acc_stderr,none": 0.03309266936071721}, "mmlu_management": {"alias": " - management", "acc,none": 0.4368932038834951, "acc_stderr,none": 0.04911147107365777}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.47863247863247865, "acc_stderr,none": 0.032726164476349545}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.42, "acc_stderr,none": 0.049604496374885836}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.42528735632183906, "acc_stderr,none": 0.01767922548943145}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3758169934640523, "acc_stderr,none": 0.027732834353363933}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.31560283687943264, "acc_stderr,none": 0.02772498944950931}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.34558823529411764, "acc_stderr,none": 0.028888193103988637}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3192771084337349, "acc_stderr,none": 0.036293353299478595}, "mmlu_social_sciences": {"acc,none": 0.37796555086122846, "acc_stderr,none": 0.008679338312563796, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.04227054451232199}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.033586181457325226}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.45595854922279794, "acc_stderr,none": 0.03594413711272438}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3282051282051282, "acc_stderr,none": 0.023807633198657266}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31932773109243695, "acc_stderr,none": 0.030283995525884396}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.42935779816513764, "acc_stderr,none": 0.021222286397236508}, 
"mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3282442748091603, "acc_stderr,none": 0.041184385658062976}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3660130718954248, "acc_stderr,none": 0.019488025745529675}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.04607582090719976}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3142857142857143, "acc_stderr,none": 0.029719329422417475}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.5074626865671642, "acc_stderr,none": 0.03535140084276719}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.53, "acc_stderr,none": 0.05016135580465919}, "mmlu_stem": {"acc,none": 0.31937836980653345, "acc_stderr,none": 0.008232047495669444, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.4074074074074074, "acc_stderr,none": 0.04244633238353228}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.24342105263157895, "acc_stderr,none": 0.034923496688842384}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3680555555555556, "acc_stderr,none": 0.040329990539607195}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.043364327079931785}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.48, "acc_stderr,none": 0.050211673156867795}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.35319148936170214, "acc_stderr,none": 0.031245325202761926}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.30344827586206896, "acc_stderr,none": 0.038312260488503336}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.02278967314577657}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.43870967741935485, "acc_stderr,none": 0.028229497320317216}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.031447125816782426}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.4, "acc_stderr,none": 0.049236596391733084}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.22962962962962963, "acc_stderr,none": 0.02564410863926763}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.33774834437086093, "acc_stderr,none": 0.038615575462551684}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.030058202704309846}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.35714285714285715, "acc_stderr,none": 0.04547960999764376}, "mmlu_pro": {"exact_match,custom-extract": 0.1825132978723404, "exact_match_stderr,custom-extract": 0.0034866040316691178, "alias": "mmlu_pro"}, 
"mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3207810320781032, "exact_match_stderr,custom-extract": 0.017444267260255466}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.15335868187579213, "exact_match_stderr,custom-extract": 0.012836327957910565}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.12014134275618374, "exact_match_stderr,custom-extract": 0.00966766057195361}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.20243902439024392, "exact_match_stderr,custom-extract": 0.019868606646141387}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.24881516587677724, "exact_match_stderr,custom-extract": 0.014890114672302446}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.16718266253869968, "exact_match_stderr,custom-extract": 0.011993137667893424}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.23227383863080683, "exact_match_stderr,custom-extract": 0.014773802549131708}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.1732283464566929, "exact_match_stderr,custom-extract": 0.019413796399290077}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.12806539509536785, "exact_match_stderr,custom-extract": 0.010075381773702279}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.14655810510732792, "exact_match_stderr,custom-extract": 0.009625538058445095}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1699134199134199, "exact_match_stderr,custom-extract": 0.012361599999468283}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.17034068136272545, "exact_match_stderr,custom-extract": 0.016845907683885487}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.15473441108545036, "exact_match_stderr,custom-extract": 0.010038127358043915}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2706766917293233, "exact_match_stderr,custom-extract": 0.015738245510500747}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.262, "acc_stderr,none": 0.019684688820194713, "acc_norm,none": 0.392, "acc_norm_stderr,none": 0.021854684955611266}, "piqa": {"alias": "piqa", "acc,none": 0.7257889009793254, "acc_stderr,none": 0.010408618664933384, "acc_norm,none": 0.721436343852013, "acc_norm_stderr,none": 0.010459397235965171}, "race": {"alias": "race", "acc,none": 0.37799043062200954, "acc_stderr,none": 0.015006820447473675}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4467758444216991, "acc_stderr,none": 0.011249786691110373}, "winogrande": {"alias": "winogrande", "acc,none": 0.6471981057616417, "acc_stderr,none": 0.01342972810178896}} {"created_at": "2025-05-02T11:36:12.463427", "global_step": 472000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4206484641638225, "acc_stderr,none": 0.014426211252508397, "acc_norm,none": 0.4496587030716723, "acc_norm_stderr,none": 0.01453714444428474}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7209595959595959, "acc_stderr,none": 0.009203588704032633, "acc_norm,none": 0.7133838383838383, "acc_norm_stderr,none": 0.009278551100969297}, "boolq": {"alias": "boolq", "acc,none": 0.7168195718654434, "acc_stderr,none": 0.007880052012351934}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2800982800982801, "acc_stderr,none": 0.012856179020358148}, 
"copa": {"alias": "copa", "acc,none": 0.78, "acc_stderr,none": 0.04163331998932262}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47161919936267677, "acc_stderr,none": 0.004981736689518745, "acc_norm,none": 0.6394144592710616, "acc_norm_stderr,none": 0.004791890625834205}, "mmlu": {"acc,none": 0.3524426719840479, "acc_stderr,none": 0.004003809971907789, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3375132837407014, "acc_stderr,none": 0.006836277924365067, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.037184890068181146}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.4121212121212121, "acc_stderr,none": 0.03843566993588717}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.45098039215686275, "acc_stderr,none": 0.03492406104163613}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.4936708860759494, "acc_stderr,none": 0.032544620107678585}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4462809917355372, "acc_stderr,none": 0.04537935177947879}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.37962962962962965, "acc_stderr,none": 0.04691521224077742}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3558282208588957, "acc_stderr,none": 0.03761521380046734}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.32947976878612717, "acc_stderr,none": 0.02530525813187972}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.25251396648044694, "acc_stderr,none": 0.014530330201468667}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.36977491961414793, "acc_stderr,none": 0.027417996705630998}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3765432098765432, "acc_stderr,none": 0.026959344518747787}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3226857887874837, "acc_stderr,none": 0.011940264193195983}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3391812865497076, "acc_stderr,none": 0.036310534964889056}, "mmlu_other": {"acc,none": 0.3871902156420985, "acc_stderr,none": 0.008720764964056132, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.39, "acc_stderr,none": 0.04902071300001975}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.39245283018867927, "acc_stderr,none": 0.03005258057955784}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3699421965317919, "acc_stderr,none": 0.036812296333943194}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4304932735426009, "acc_stderr,none": 0.033231973029429394}, "mmlu_management": {"alias": " - management", "acc,none": 0.42718446601941745, "acc_stderr,none": 0.04897957737781168}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.43162393162393164, "acc_stderr,none": 0.0324483553531149}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.4, "acc_stderr,none": 0.049236596391733084}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.4329501915708812, "acc_stderr,none": 0.017718469101513985}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3431372549019608, "acc_stderr,none": 0.02718449890994162}, 
"mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.32269503546099293, "acc_stderr,none": 0.027889139300534785}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.33455882352941174, "acc_stderr,none": 0.028661996202335307}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3192771084337349, "acc_stderr,none": 0.03629335329947859}, "mmlu_social_sciences": {"acc,none": 0.3675658108547286, "acc_stderr,none": 0.008654662644799452, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.30701754385964913, "acc_stderr,none": 0.043391383225798594}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.03358618145732523}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.43005181347150256, "acc_stderr,none": 0.035729543331448094}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.32564102564102565, "acc_stderr,none": 0.02375966576741229}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31512605042016806, "acc_stderr,none": 0.030176808288974337}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3908256880733945, "acc_stderr,none": 0.020920058346111062}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3053435114503817, "acc_stderr,none": 0.0403931497872456}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.36764705882352944, "acc_stderr,none": 0.019506291693954854}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.4, "acc_stderr,none": 0.0469237132203465}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.31020408163265306, "acc_stderr,none": 0.029613459872484378}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.4925373134328358, "acc_stderr,none": 0.035351400842767194}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.48, "acc_stderr,none": 0.050211673156867795}, "mmlu_stem": {"acc,none": 0.32572153504598794, "acc_stderr,none": 0.008288280135121454, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3851851851851852, "acc_stderr,none": 0.042039210401562783}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.28289473684210525, "acc_stderr,none": 0.03665349695640767}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3472222222222222, "acc_stderr,none": 0.039812405437178615}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421296}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.52, "acc_stderr,none": 0.05021167315686779}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3617021276595745, "acc_stderr,none": 0.03141082197596239}, 
"mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3103448275862069, "acc_stderr,none": 0.03855289616378948}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.29365079365079366, "acc_stderr,none": 0.023456037383982026}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.43870967741935485, "acc_stderr,none": 0.028229497320317216}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2955665024630542, "acc_stderr,none": 0.032104944337514575}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.39, "acc_stderr,none": 0.04902071300001974}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.027309140588230182}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.32450331125827814, "acc_stderr,none": 0.03822746937658753}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.25462962962962965, "acc_stderr,none": 0.02971127586000533}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3482142857142857, "acc_stderr,none": 0.045218299028335865}, "mmlu_pro": {"exact_match,custom-extract": 0.17877327127659576, "exact_match_stderr,custom-extract": 0.00345727968779424, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3263598326359833, "exact_match_stderr,custom-extract": 0.01752289373526119}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.14575411913814956, "exact_match_stderr,custom-extract": 0.01257010068915711}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.1157243816254417, "exact_match_stderr,custom-extract": 0.009512068239624282}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.22926829268292684, "exact_match_stderr,custom-extract": 0.020785570898756725}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.24170616113744076, "exact_match_stderr,custom-extract": 0.014745137840103596}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.16821465428276575, "exact_match_stderr,custom-extract": 0.01202264067537525}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2310513447432763, "exact_match_stderr,custom-extract": 0.014746599750625127}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.15748031496062992, "exact_match_stderr,custom-extract": 0.018685785855939545}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1371480472297911, "exact_match_stderr,custom-extract": 0.01037209807700218}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.13323464100666174, "exact_match_stderr,custom-extract": 0.009248950748998359}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1590909090909091, "exact_match_stderr,custom-extract": 0.012039164679107935}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.156312625250501, "exact_match_stderr,custom-extract": 0.01627320637916909}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.15704387990762125, "exact_match_stderr,custom-extract": 0.010098936605569611}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2518796992481203, 
"exact_match_stderr,custom-extract": 0.015376345973657985}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.268, "acc_stderr,none": 0.019827714859587564, "acc_norm,none": 0.396, "acc_norm_stderr,none": 0.021893529941665817}, "piqa": {"alias": "piqa", "acc,none": 0.7263329706202394, "acc_stderr,none": 0.01040218420622921, "acc_norm,none": 0.720348204570185, "acc_norm_stderr,none": 0.010471899530306559}, "race": {"alias": "race", "acc,none": 0.384688995215311, "acc_stderr,none": 0.015057468843874157}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4437052200614125, "acc_stderr,none": 0.01124213095108688}, "winogrande": {"alias": "winogrande", "acc,none": 0.6448303078137332, "acc_stderr,none": 0.013450047479569257}} {"created_at": "2025-05-02T13:41:01.079431", "global_step": 474000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.41638225255972694, "acc_stderr,none": 0.014405618279436164, "acc_norm,none": 0.447098976109215, "acc_norm_stderr,none": 0.014529380160526842}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7314814814814815, "acc_stderr,none": 0.009094042554994852, "acc_norm,none": 0.7264309764309764, "acc_norm_stderr,none": 0.009147424438490741}, "boolq": {"alias": "boolq", "acc,none": 0.7568807339449541, "acc_stderr,none": 0.007502671870926002}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.29238329238329236, "acc_stderr,none": 0.013022531002213357}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816506}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47679745070703045, "acc_stderr,none": 0.004984405935541096, "acc_norm,none": 0.6415056761601274, "acc_norm_stderr,none": 0.004785781979354876}, "mmlu": {"acc,none": 0.3437544509329155, "acc_stderr,none": 0.003977910012233045, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3279489904357067, "acc_stderr,none": 0.0067863520792915315, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.18253968253968253, "acc_stderr,none": 0.03455071019102148}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.4, "acc_stderr,none": 0.03825460278380026}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.46078431372549017, "acc_stderr,none": 0.03498501649369527}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.4810126582278481, "acc_stderr,none": 0.03252375148090448}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4049586776859504, "acc_stderr,none": 0.04481137755942469}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.37962962962962965, "acc_stderr,none": 0.04691521224077742}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3312883435582822, "acc_stderr,none": 0.03697983910025588}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.30346820809248554, "acc_stderr,none": 0.024752411960917202}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24916201117318434, "acc_stderr,none": 0.014465893829859933}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3536977491961415, "acc_stderr,none": 0.02715520810320088}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.36728395061728397, "acc_stderr,none": 0.026822801759507898}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.31877444589308995, "acc_stderr,none": 0.011901895635786103}, "mmlu_world_religions": {"alias": " - world_religions", 
"acc,none": 0.32748538011695905, "acc_stderr,none": 0.035993357714560276}, "mmlu_other": {"acc,none": 0.3746379143868684, "acc_stderr,none": 0.008656673421599967, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.37358490566037733, "acc_stderr,none": 0.02977308271331987}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3815028901734104, "acc_stderr,none": 0.03703851193099521}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.42152466367713004, "acc_stderr,none": 0.033141902221106564}, "mmlu_management": {"alias": " - management", "acc,none": 0.4174757281553398, "acc_stderr,none": 0.048828405482122375}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.452991452991453, "acc_stderr,none": 0.0326109987309862}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.41, "acc_stderr,none": 0.04943110704237102}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.4099616858237548, "acc_stderr,none": 0.017587672312336048}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3562091503267974, "acc_stderr,none": 0.027420477662629228}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2907801418439716, "acc_stderr,none": 0.027090664368353178}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3014705882352941, "acc_stderr,none": 0.027875982114273168}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3192771084337349, "acc_stderr,none": 0.036293353299478595}, "mmlu_social_sciences": {"acc,none": 0.36789080272993174, "acc_stderr,none": 0.008651734744178444, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.32456140350877194, "acc_stderr,none": 0.044045561573747685}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.30808080808080807, "acc_stderr,none": 0.03289477330098615}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.43523316062176165, "acc_stderr,none": 0.03578038165008585}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3076923076923077, "acc_stderr,none": 0.02340092891831048}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.33613445378151263, "acc_stderr,none": 0.030684737115135356}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3779816513761468, "acc_stderr,none": 0.020789187066728106}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.31297709923664124, "acc_stderr,none": 0.04066962905677698}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.37745098039215685, "acc_stderr,none": 0.019610851474880276}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.4090909090909091, "acc_stderr,none": 0.04709306978661896}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3183673469387755, "acc_stderr,none": 0.029822533793982062}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.4975124378109453, "acc_stderr,none": 0.03535490150137289}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.49, 
"acc_stderr,none": 0.05024183937956911}, "mmlu_stem": {"acc,none": 0.3133523628290517, "acc_stderr,none": 0.008199911742670096, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.37777777777777777, "acc_stderr,none": 0.04188307537595853}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.03583496176361063}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3541666666666667, "acc_stderr,none": 0.039994111357535424}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.040925639582376556}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.5, "acc_stderr,none": 0.050251890762960605}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3446808510638298, "acc_stderr,none": 0.03106898596312215}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.3103448275862069, "acc_stderr,none": 0.03855289616378948}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.29365079365079366, "acc_stderr,none": 0.023456037383982022}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.42258064516129035, "acc_stderr,none": 0.02810096472427264}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.270935960591133, "acc_stderr,none": 0.031270907132976984}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.39, "acc_stderr,none": 0.04902071300001974}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.026593939101844072}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.304635761589404, "acc_stderr,none": 0.03757949922943342}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.028765111718046944}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "mmlu_pro": {"exact_match,custom-extract": 0.18060172872340424, "exact_match_stderr,custom-extract": 0.0034757564501199495, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3207810320781032, "exact_match_stderr,custom-extract": 0.017444267260255462}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.15716096324461343, "exact_match_stderr,custom-extract": 0.012965269640409607}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.11395759717314488, "exact_match_stderr,custom-extract": 0.009448602796221683}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1902439024390244, "exact_match_stderr,custom-extract": 0.019407555306945948}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.23459715639810427, 
"exact_match_stderr,custom-extract": 0.014594614234695607}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.17337461300309598, "exact_match_stderr,custom-extract": 0.012167726609185036}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2371638141809291, "exact_match_stderr,custom-extract": 0.01488088730985558}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.1679790026246719, "exact_match_stderr,custom-extract": 0.01917797923756725}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1371480472297911, "exact_match_stderr,custom-extract": 0.010372098077002182}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.13989637305699482, "exact_match_stderr,custom-extract": 0.009440863812584696}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.170995670995671, "exact_match_stderr,custom-extract": 0.012392818993997873}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1523046092184369, "exact_match_stderr,custom-extract": 0.016101330436514016}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.16551193225558122, "exact_match_stderr,custom-extract": 0.010315430889447998}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.24937343358395989, "exact_match_stderr,custom-extract": 0.01532526162657071}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.268, "acc_stderr,none": 0.019827714859587564, "acc_norm,none": 0.394, "acc_norm_stderr,none": 0.02187429930168925}, "piqa": {"alias": "piqa", "acc,none": 0.7285092491838956, "acc_stderr,none": 0.010376251176596135, "acc_norm,none": 0.7225244831338411, "acc_norm_stderr,none": 0.010446818281039947}, "race": {"alias": "race", "acc,none": 0.3722488038277512, "acc_stderr,none": 0.014960984760899331}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.44933469805527126, "acc_stderr,none": 0.011255834856639545}, "winogrande": {"alias": "winogrande", "acc,none": 0.6495659037095501, "acc_stderr,none": 0.013409047676670187}} {"created_at": "2025-05-02T15:37:15.168252", "global_step": 476000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4138225255972696, "acc_stderr,none": 0.014392730009221014, "acc_norm,none": 0.4462457337883959, "acc_norm_stderr,none": 0.014526705548539982}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7331649831649831, "acc_stderr,none": 0.00907591585926726, "acc_norm,none": 0.7171717171717171, "acc_norm_stderr,none": 0.009241472775328228}, "boolq": {"alias": "boolq", "acc,none": 0.754434250764526, "acc_stderr,none": 0.0075281304213486}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.27436527436527436, "acc_stderr,none": 0.012774493368021556}, "copa": {"alias": "copa", "acc,none": 0.8, "acc_stderr,none": 0.04020151261036845}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4738099980083649, "acc_stderr,none": 0.00498293156594595, "acc_norm,none": 0.6426010754829715, "acc_norm_stderr,none": 0.0047825427541020775}, "mmlu": {"acc,none": 0.34346959122632104, "acc_stderr,none": 0.003975969589776817, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3239107332624867, "acc_stderr,none": 0.00676539466524145, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1746031746031746, "acc_stderr,none": 0.033954900208561116}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.3878787878787879, "acc_stderr,none": 
0.038049136539710114}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.4166666666666667, "acc_stderr,none": 0.03460228327239171}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.46835443037974683, "acc_stderr,none": 0.03248197400511075}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4380165289256198, "acc_stderr,none": 0.04529146804435792}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.37037037037037035, "acc_stderr,none": 0.04668408033024932}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.32515337423312884, "acc_stderr,none": 0.036803503712864595}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.33236994219653176, "acc_stderr,none": 0.025361168749688225}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23910614525139665, "acc_stderr,none": 0.014265554192331158}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.36012861736334406, "acc_stderr,none": 0.02726429759980402}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3734567901234568, "acc_stderr,none": 0.02691500301138015}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3155149934810952, "acc_stderr,none": 0.011869184843058638}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.29239766081871343, "acc_stderr,none": 0.034886477134579215}, "mmlu_other": {"acc,none": 0.37753459929192146, "acc_stderr,none": 0.008679474188618686, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.41, "acc_stderr,none": 0.049431107042371025}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.37358490566037733, "acc_stderr,none": 0.029773082713319875}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3930635838150289, "acc_stderr,none": 0.0372424959581773}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4260089686098655, "acc_stderr,none": 0.0331883328621728}, "mmlu_management": {"alias": " - management", "acc,none": 0.4077669902912621, "acc_stderr,none": 0.04865777570410769}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.4188034188034188, "acc_stderr,none": 0.03232128912157792}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.42, "acc_stderr,none": 0.049604496374885836}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.4125159642401022, "acc_stderr,none": 0.017604149108671936}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.35294117647058826, "acc_stderr,none": 0.027363593284684934}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2978723404255319, "acc_stderr,none": 0.027281608344469417}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.028418208619406794}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3192771084337349, "acc_stderr,none": 0.03629335329947859}, "mmlu_social_sciences": {"acc,none": 0.37341566460838477, "acc_stderr,none": 0.008672516670829983, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2807017543859649, "acc_stderr,none": 0.04227054451232199}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.31313131313131315, 
"acc_stderr,none": 0.033042050878136525}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.42487046632124353, "acc_stderr,none": 0.035674713352125395}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.34615384615384615, "acc_stderr,none": 0.024121125416941197}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31512605042016806, "acc_stderr,none": 0.030176808288974337}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.42935779816513764, "acc_stderr,none": 0.02122228639723651}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2900763358778626, "acc_stderr,none": 0.03980066246467766}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.36764705882352944, "acc_stderr,none": 0.019506291693954854}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.4090909090909091, "acc_stderr,none": 0.04709306978661896}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.31020408163265306, "acc_stderr,none": 0.029613459872484375}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.48756218905472637, "acc_stderr,none": 0.0353443984853958}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.47, "acc_stderr,none": 0.050161355804659205}, "mmlu_stem": {"acc,none": 0.30986362194735173, "acc_stderr,none": 0.008175461382828939, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.35555555555555557, "acc_stderr,none": 0.04135176749720386}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2565789473684211, "acc_stderr,none": 0.0355418036802569}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3611111111111111, "acc_stderr,none": 0.04016660030451233}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.47, "acc_stderr,none": 0.050161355804659205}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3404255319148936, "acc_stderr,none": 0.030976692998534436}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2827586206896552, "acc_stderr,none": 0.03752833958003337}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.02286083830923207}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.432258064516129, "acc_stderr,none": 0.028181739720019406}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.27586206896551724, "acc_stderr,none": 0.03144712581678242}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.41, "acc_stderr,none": 0.049431107042371025}, "mmlu_high_school_mathematics": {"alias": " - 
high_school_mathematics", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.02730914058823019}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31125827814569534, "acc_stderr,none": 0.037804458505267334}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.028353212866863445}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "mmlu_pro": {"exact_match,custom-extract": 0.1819315159574468, "exact_match_stderr,custom-extract": 0.0034786397916285973, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3472803347280335, "exact_match_stderr,custom-extract": 0.017792908002753113}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.155893536121673, "exact_match_stderr,custom-extract": 0.012922589786819783}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.11749116607773852, "exact_match_stderr,custom-extract": 0.0095748247841722}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.2121951219512195, "exact_match_stderr,custom-extract": 0.02021693788475414}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.235781990521327, "exact_match_stderr,custom-extract": 0.014620093887364651}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.14447884416924664, "exact_match_stderr,custom-extract": 0.011300036008717557}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.24083129584352078, "exact_match_stderr,custom-extract": 0.01495941399445001}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.1679790026246719, "exact_match_stderr,custom-extract": 0.01917797923756725}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.13896457765667575, "exact_match_stderr,custom-extract": 0.01042956565875803}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.14581791265729088, "exact_match_stderr,custom-extract": 0.009605363046932724}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.16883116883116883, "exact_match_stderr,custom-extract": 0.012330199046169844}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.15430861723446893, "exact_match_stderr,custom-extract": 0.01618774561809322}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1624326404926867, "exact_match_stderr,custom-extract": 0.010237859802710476}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2543859649122807, "exact_match_stderr,custom-extract": 0.015426750291360192}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.262, "acc_stderr,none": 0.019684688820194716, "acc_norm,none": 0.398, "acc_norm_stderr,none": 0.021912377885779974}, "piqa": {"alias": "piqa", "acc,none": 0.7257889009793254, "acc_stderr,none": 0.010408618664933382, "acc_norm,none": 0.7219804134929271, "acc_norm_stderr,none": 0.010453117358332811}, "race": {"alias": "race", "acc,none": 0.3770334928229665, "acc_stderr,none": 0.014999337089843353}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.44575230296827023, "acc_stderr,none": 0.01124728305057907}, "winogrande": {"alias": "winogrande", "acc,none": 0.6550907655880032, "acc_stderr,none": 0.013359379805033685}} {"created_at": 
"2025-05-02T18:57:37.311645", "global_step": 478000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4087030716723549, "acc_stderr,none": 0.014365750345427, "acc_norm,none": 0.4377133105802048, "acc_norm_stderr,none": 0.014497573881108288}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7306397306397306, "acc_stderr,none": 0.009103043207756985, "acc_norm,none": 0.7163299663299664, "acc_norm_stderr,none": 0.009249781691140749}, "boolq": {"alias": "boolq", "acc,none": 0.7608562691131499, "acc_stderr,none": 0.007460593054147722}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2809172809172809, "acc_stderr,none": 0.012867635159174046}, "copa": {"alias": "copa", "acc,none": 0.79, "acc_stderr,none": 0.04093601807403326}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47560246962756425, "acc_stderr,none": 0.0049838376415028965, "acc_norm,none": 0.6411073491336388, "acc_norm_stderr,none": 0.004786953146657073}, "mmlu": {"acc,none": 0.3495228599914542, "acc_stderr,none": 0.003986488829962639, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3279489904357067, "acc_stderr,none": 0.00676426789343054, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1984126984126984, "acc_stderr,none": 0.035670166752768635}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.42424242424242425, "acc_stderr,none": 0.038592681420702615}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.4411764705882353, "acc_stderr,none": 0.034849415144292316}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.5063291139240507, "acc_stderr,none": 0.03254462010767859}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.47107438016528924, "acc_stderr,none": 0.04556710331269498}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.35185185185185186, "acc_stderr,none": 0.04616631111801713}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3558282208588957, "acc_stderr,none": 0.03761521380046734}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3468208092485549, "acc_stderr,none": 0.025624723994030454}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23575418994413408, "acc_stderr,none": 0.014196375686290804}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3858520900321543, "acc_stderr,none": 0.027648149599751464}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.36728395061728397, "acc_stderr,none": 0.0268228017595079}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.30182529335071706, "acc_stderr,none": 0.01172435051810589}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30409356725146197, "acc_stderr,none": 0.03528211258245233}, "mmlu_other": {"acc,none": 0.3894431927904731, "acc_stderr,none": 0.008725458570977382, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3849056603773585, "acc_stderr,none": 0.02994649856769995}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.37572254335260113, "acc_stderr,none": 0.036928207672648664}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 
0.4170403587443946, "acc_stderr,none": 0.03309266936071721}, "mmlu_management": {"alias": " - management", "acc,none": 0.4368932038834951, "acc_stderr,none": 0.04911147107365777}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.4358974358974359, "acc_stderr,none": 0.032485775115784}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.43, "acc_stderr,none": 0.04975698519562428}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.44189016602809705, "acc_stderr,none": 0.017758800534214414}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.35294117647058826, "acc_stderr,none": 0.027363593284684948}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.3120567375886525, "acc_stderr,none": 0.02764012054516993}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.34558823529411764, "acc_stderr,none": 0.028888193103988633}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3253012048192771, "acc_stderr,none": 0.03647168523683227}, "mmlu_social_sciences": {"acc,none": 0.3756906077348066, "acc_stderr,none": 0.008678654564680251, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.32456140350877194, "acc_stderr,none": 0.04404556157374768}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3434343434343434, "acc_stderr,none": 0.03383201223244442}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.44041450777202074, "acc_stderr,none": 0.03582724530036093}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.31025641025641026, "acc_stderr,none": 0.023454674889404288}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31512605042016806, "acc_stderr,none": 0.030176808288974333}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.41834862385321103, "acc_stderr,none": 0.021149548596443888}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3282442748091603, "acc_stderr,none": 0.041184385658062976}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3758169934640523, "acc_stderr,none": 0.019594021136577447}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.38181818181818183, "acc_stderr,none": 0.04653429807913508}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.30612244897959184, "acc_stderr,none": 0.02950489645459596}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.5024875621890548, "acc_stderr,none": 0.03535490150137288}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.51, "acc_stderr,none": 0.05024183937956912}, "mmlu_stem": {"acc,none": 0.3168411037107517, "acc_stderr,none": 0.008225359090731642, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.4, "acc_stderr,none": 0.042320736951515885}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.29605263157894735, "acc_stderr,none": 0.03715062154998905}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3680555555555556, "acc_stderr,none": 0.040329990539607195}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, 
"mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.04336432707993177}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.47, "acc_stderr,none": 0.050161355804659205}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.33617021276595743, "acc_stderr,none": 0.030881618520676942}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2896551724137931, "acc_stderr,none": 0.03780019230438014}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2698412698412698, "acc_stderr,none": 0.02286083830923207}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.44516129032258067, "acc_stderr,none": 0.028272410186214906}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.031785297106427496}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.37, "acc_stderr,none": 0.04852365870939099}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2851851851851852, "acc_stderr,none": 0.027528599210340492}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2913907284768212, "acc_stderr,none": 0.03710185726119995}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.028353212866863445}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.1813497340425532, "exact_match_stderr,custom-extract": 0.00347829757922805, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3263598326359833, "exact_match_stderr,custom-extract": 0.017522893735261183}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.15462610899873258, "exact_match_stderr,custom-extract": 0.012879610229477286}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.10424028268551237, "exact_match_stderr,custom-extract": 0.009086199159417191}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.20243902439024392, "exact_match_stderr,custom-extract": 0.01986860664614139}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.24170616113744076, "exact_match_stderr,custom-extract": 0.014745137840103595}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.1692466460268318, "exact_match_stderr,custom-extract": 0.012051980172143208}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.23838630806845965, "exact_match_stderr,custom-extract": 0.014907231542341415}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.16010498687664043, "exact_match_stderr,custom-extract": 0.018811487255752723}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.14168937329700274, "exact_match_stderr,custom-extract": 0.010514643243492041}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.15099925980754997, 
"exact_match_stderr,custom-extract": 0.009744836556602878}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.170995670995671, "exact_match_stderr,custom-extract": 0.01239281899399788}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.14428857715430862, "exact_match_stderr,custom-extract": 0.01574580862551284}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.16166281755196305, "exact_match_stderr,custom-extract": 0.010218263317126072}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2518796992481203, "exact_match_stderr,custom-extract": 0.01537634597365798}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.274, "acc_stderr,none": 0.019966103540279466, "acc_norm,none": 0.4, "acc_norm_stderr,none": 0.021930844120728505}, "piqa": {"alias": "piqa", "acc,none": 0.7247007616974973, "acc_stderr,none": 0.01042142927736953, "acc_norm,none": 0.720892274211099, "acc_norm_stderr,none": 0.010465657948498231}, "race": {"alias": "race", "acc,none": 0.3799043062200957, "acc_stderr,none": 0.01502160080493565}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.45291709314227224, "acc_stderr,none": 0.01126379679411243}, "winogrande": {"alias": "winogrande", "acc,none": 0.6527229676400947, "acc_stderr,none": 0.013380909249751244}} {"created_at": "2025-05-02T19:04:04.844533", "global_step": 480000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4180887372013652, "acc_stderr,none": 0.014413988396996088, "acc_norm,none": 0.439419795221843, "acc_norm_stderr,none": 0.014503747823580125}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7285353535353535, "acc_stderr,none": 0.009125362970360625, "acc_norm,none": 0.7150673400673401, "acc_norm_stderr,none": 0.009262170695590656}, "boolq": {"alias": "boolq", "acc,none": 0.771559633027523, "acc_stderr,none": 0.007342834051148581}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.27354627354627353, "acc_stderr,none": 0.012762608994259371}, "copa": {"alias": "copa", "acc,none": 0.79, "acc_stderr,none": 0.04093601807403326}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.476000796654053, "acc_stderr,none": 0.004984030250507296, "acc_norm,none": 0.6428002389962159, "acc_norm_stderr,none": 0.004781950883460504}, "mmlu": {"acc,none": 0.34147557328015954, "acc_stderr,none": 0.003967972569487562, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.32688629117959617, "acc_stderr,none": 0.006777844982407219, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03670066451047182}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.4, "acc_stderr,none": 0.038254602783800246}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.45588235294117646, "acc_stderr,none": 0.03495624522015473}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.48523206751054854, "acc_stderr,none": 0.032533028078777386}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4214876033057851, "acc_stderr,none": 0.04507732278775094}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3611111111111111, "acc_stderr,none": 0.04643454608906274}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.34355828220858897, "acc_stderr,none": 0.03731133519673891}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 
0.3236994219653179, "acc_stderr,none": 0.025190181327608415}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.36012861736334406, "acc_stderr,none": 0.027264297599804015}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3765432098765432, "acc_stderr,none": 0.026959344518747787}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.30964797913950454, "acc_stderr,none": 0.011808598262503318}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30994152046783624, "acc_stderr,none": 0.035469769593931624}, "mmlu_other": {"acc,none": 0.3804312841969746, "acc_stderr,none": 0.008681854364792048, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.37735849056603776, "acc_stderr,none": 0.029832808114796005}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3930635838150289, "acc_stderr,none": 0.0372424959581773}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.046882617226215034}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.42152466367713004, "acc_stderr,none": 0.03314190222110656}, "mmlu_management": {"alias": " - management", "acc,none": 0.4077669902912621, "acc_stderr,none": 0.04865777570410769}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.44871794871794873, "acc_stderr,none": 0.0325833464938688}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.44, "acc_stderr,none": 0.04988876515698589}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.42017879948914433, "acc_stderr,none": 0.017650651363078012}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.35294117647058826, "acc_stderr,none": 0.027363593284684934}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2801418439716312, "acc_stderr,none": 0.026789172351140245}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.02841820861940679}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3433734939759036, "acc_stderr,none": 0.036965843170106004}, "mmlu_social_sciences": {"acc,none": 0.368540786480338, "acc_stderr,none": 0.008663159748711824, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.04372748290278008}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.32323232323232326, "acc_stderr,none": 0.03332299921070644}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.43005181347150256, "acc_stderr,none": 0.035729543331448094}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3282051282051282, "acc_stderr,none": 0.023807633198657262}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3445378151260504, "acc_stderr,none": 0.030868682604121626}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3944954128440367, "acc_stderr,none": 0.020954642108587485}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3282442748091603, "acc_stderr,none": 
0.041184385658062976}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.35947712418300654, "acc_stderr,none": 0.01941253924203216}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.4, "acc_stderr,none": 0.0469237132203465}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2979591836734694, "acc_stderr,none": 0.02927956741106568}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.48258706467661694, "acc_stderr,none": 0.035333892347392454}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956911}, "mmlu_stem": {"acc,none": 0.29844592451633367, "acc_stderr,none": 0.008081681519214772, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384739}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.362962962962963, "acc_stderr,none": 0.04153948404742399}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.03690677986137282}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3194444444444444, "acc_stderr,none": 0.03899073687357335}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.038739587141493524}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.48, "acc_stderr,none": 0.050211673156867795}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3404255319148936, "acc_stderr,none": 0.030976692998534446}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.03695183311650232}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.022569897074918417}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.4290322580645161, "acc_stderr,none": 0.02815603653823321}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.03194740072265541}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02671924078371217}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23178807947019867, "acc_stderr,none": 0.03445406271987053}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.22685185185185186, "acc_stderr,none": 0.028561650102422256}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.0443280405529152}, "mmlu_pro": {"exact_match,custom-extract": 0.18035239361702127, "exact_match_stderr,custom-extract": 0.0034727854149345185, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.32217573221757323, "exact_match_stderr,custom-extract": 
0.01746419040868442}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.1596958174904943, "exact_match_stderr,custom-extract": 0.01304974197804603}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.10777385159010601, "exact_match_stderr,custom-extract": 0.00922067835765709}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.1926829268292683, "exact_match_stderr,custom-extract": 0.019502129313734493}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.24407582938388625, "exact_match_stderr,custom-extract": 0.01479407157778744}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.15479876160990713, "exact_match_stderr,custom-extract": 0.011625887729987494}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2273838630806846, "exact_match_stderr,custom-extract": 0.01466394014668951}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.16272965879265092, "exact_match_stderr,custom-extract": 0.018935396882827783}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.13533151680290645, "exact_match_stderr,custom-extract": 0.01031401946878533}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.154700222057735, "exact_match_stderr,custom-extract": 0.009842013620867987}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.16774891774891776, "exact_match_stderr,custom-extract": 0.01229861474036573}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1523046092184369, "exact_match_stderr,custom-extract": 0.01610133043651402}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.16782140107775212, "exact_match_stderr,custom-extract": 0.010372766376157203}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2518796992481203, "exact_match_stderr,custom-extract": 0.015376345973657985}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.274, "acc_stderr,none": 0.019966103540279462, "acc_norm,none": 0.396, "acc_norm_stderr,none": 0.021893529941665817}, "piqa": {"alias": "piqa", "acc,none": 0.7268770402611534, "acc_stderr,none": 0.010395730264453267, "acc_norm,none": 0.7230685527747551, "acc_norm_stderr,none": 0.010440499969334535}, "race": {"alias": "race", "acc,none": 0.3712918660287081, "acc_stderr,none": 0.01495312651508941}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.44575230296827023, "acc_stderr,none": 0.01124728305057907}, "winogrande": {"alias": "winogrande", "acc,none": 0.6527229676400947, "acc_stderr,none": 0.013380909249751246}} {"created_at": "2025-05-02T20:41:11.508158", "global_step": 482000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4052901023890785, "acc_stderr,none": 0.014346869060229334, "acc_norm,none": 0.43686006825938567, "acc_norm_stderr,none": 0.014494421584256515}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7281144781144782, "acc_stderr,none": 0.009129795867310494, "acc_norm,none": 0.7138047138047138, "acc_norm_stderr,none": 0.009274470774627728}, "boolq": {"alias": "boolq", "acc,none": 0.7672782874617737, "acc_stderr,none": 0.007390731859680137}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2932022932022932, "acc_stderr,none": 0.0130332081673615}, "copa": {"alias": "copa", "acc,none": 0.78, "acc_stderr,none": 0.041633319989322626}, "hellaswag": {"alias": "hellaswag", "acc,none": 
0.47450707030472017, "acc_stderr,none": 0.0049832915782890425, "acc_norm,none": 0.6401115315674168, "acc_norm_stderr,none": 0.004789865379084506}, "mmlu": {"acc,none": 0.3344252955419456, "acc_stderr,none": 0.0039490781729772245, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3241232731137088, "acc_stderr,none": 0.006754286113231627, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03670066451047181}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.4, "acc_stderr,none": 0.038254602783800246}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.46568627450980393, "acc_stderr,none": 0.03501038327635897}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.5021097046413502, "acc_stderr,none": 0.032546938018020076}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4214876033057851, "acc_stderr,none": 0.045077322787750944}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.39814814814814814, "acc_stderr,none": 0.04732332615978815}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3558282208588957, "acc_stderr,none": 0.03761521380046734}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3236994219653179, "acc_stderr,none": 0.025190181327608422}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2435754189944134, "acc_stderr,none": 0.014355911964767864}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3247588424437299, "acc_stderr,none": 0.026596782287697043}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3765432098765432, "acc_stderr,none": 0.026959344518747784}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.29986962190352023, "acc_stderr,none": 0.011702660860193998}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30994152046783624, "acc_stderr,none": 0.03546976959393163}, "mmlu_other": {"acc,none": 0.3723849372384937, "acc_stderr,none": 0.008638494454770823, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.41, "acc_stderr,none": 0.049431107042371025}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.36981132075471695, "acc_stderr,none": 0.029711421880107922}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3699421965317919, "acc_stderr,none": 0.03681229633394319}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4349775784753363, "acc_stderr,none": 0.033272833702713445}, "mmlu_management": {"alias": " - management", "acc,none": 0.3592233009708738, "acc_stderr,none": 0.047504583990416946}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.4444444444444444, "acc_stderr,none": 0.03255326307272485}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.44, "acc_stderr,none": 0.04988876515698589}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.4086845466155811, "acc_stderr,none": 0.017579250148153393}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.32679738562091504, "acc_stderr,none": 0.02685729466328142}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2695035460992908, "acc_stderr,none": 0.02646903681859063}, 
"mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.30514705882352944, "acc_stderr,none": 0.0279715413701706}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3614457831325301, "acc_stderr,none": 0.03740059382029321}, "mmlu_social_sciences": {"acc,none": 0.3493662658433539, "acc_stderr,none": 0.00856634801852034, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.30701754385964913, "acc_stderr,none": 0.0433913832257986}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.30303030303030304, "acc_stderr,none": 0.032742879140268674}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.39378238341968913, "acc_stderr,none": 0.03526077095548237}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3128205128205128, "acc_stderr,none": 0.023507579020645365}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.029597329730978082}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.363302752293578, "acc_stderr,none": 0.020620603919625807}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.31297709923664124, "acc_stderr,none": 0.04066962905677698}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3660130718954248, "acc_stderr,none": 0.019488025745529665}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.37272727272727274, "acc_stderr,none": 0.04631381319425464}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.0289205832206756}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.4527363184079602, "acc_stderr,none": 0.03519702717576915}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.47, "acc_stderr,none": 0.050161355804659205}, "mmlu_stem": {"acc,none": 0.2978116079923882, "acc_stderr,none": 0.008085516897590603, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.37777777777777777, "acc_stderr,none": 0.04188307537595853}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.03583496176361063}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.039420826399272135}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617746}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.47, "acc_stderr,none": 0.050161355804659205}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.34893617021276596, "acc_stderr,none": 0.031158522131357773}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, 
"mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23544973544973544, "acc_stderr,none": 0.021851509822031708}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.4032258064516129, "acc_stderr,none": 0.02790615082604114}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.0319474007226554}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.27037037037037037, "acc_stderr,none": 0.027080372815145668}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.034791855725996586}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.028765111718046934}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.33035714285714285, "acc_stderr,none": 0.04464285714285714}, "mmlu_pro": {"exact_match,custom-extract": 0.18276263297872342, "exact_match_stderr,custom-extract": 0.0034949896205120033, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3054393305439331, "exact_match_stderr,custom-extract": 0.017213178087194265}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.15462610899873258, "exact_match_stderr,custom-extract": 0.01287961022947729}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.1166077738515901, "exact_match_stderr,custom-extract": 0.009543534246504533}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.2048780487804878, "exact_match_stderr,custom-extract": 0.01995735269083492}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.23933649289099526, "exact_match_stderr,custom-extract": 0.014695587900810768}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.1651186790505676, "exact_match_stderr,custom-extract": 0.0119336362631826}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2310513447432763, "exact_match_stderr,custom-extract": 0.014746599750625127}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.1889763779527559, "exact_match_stderr,custom-extract": 0.02008300581197246}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.13533151680290645, "exact_match_stderr,custom-extract": 0.010314019468785329}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.16506291635825315, "exact_match_stderr,custom-extract": 0.010103800165231483}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.15584415584415584, "exact_match_stderr,custom-extract": 0.011938663890309693}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1523046092184369, "exact_match_stderr,custom-extract": 0.016101330436514016}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.16782140107775212, "exact_match_stderr,custom-extract": 0.010372766376157212}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2619047619047619, "exact_match_stderr,custom-extract": 0.015573948649345878}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.272, "acc_stderr,none": 0.019920483209566072, 
"acc_norm,none": 0.396, "acc_norm_stderr,none": 0.021893529941665817}, "piqa": {"alias": "piqa", "acc,none": 0.7279651795429815, "acc_stderr,none": 0.01038276378624738, "acc_norm,none": 0.7219804134929271, "acc_norm_stderr,none": 0.010453117358332811}, "race": {"alias": "race", "acc,none": 0.3751196172248804, "acc_stderr,none": 0.014984183551431952}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.44779938587512796, "acc_stderr,none": 0.011252242102001767}, "winogrande": {"alias": "winogrande", "acc,none": 0.6629834254143646, "acc_stderr,none": 0.01328495576939525}} {"created_at": "2025-05-02T22:30:13.812873", "global_step": 484000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4129692832764505, "acc_stderr,none": 0.014388344935398324, "acc_norm,none": 0.44283276450511944, "acc_norm_stderr,none": 0.014515573873348907}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.734006734006734, "acc_stderr,none": 0.009066789565615694, "acc_norm,none": 0.7192760942760943, "acc_norm_stderr,none": 0.009220526174711356}, "boolq": {"alias": "boolq", "acc,none": 0.7694189602446483, "acc_stderr,none": 0.007366917025520441}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.28501228501228504, "acc_stderr,none": 0.012924125439047213}, "copa": {"alias": "copa", "acc,none": 0.79, "acc_stderr,none": 0.04093601807403326}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47460665206134234, "acc_stderr,none": 0.004983342213776258, "acc_norm,none": 0.6410077673770165, "acc_norm_stderr,none": 0.0047872453779671045}, "mmlu": {"acc,none": 0.3336419313488107, "acc_stderr,none": 0.0039438524879566, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3249734325185972, "acc_stderr,none": 0.0067554867542048945, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.18253968253968253, "acc_stderr,none": 0.03455071019102148}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.4121212121212121, "acc_stderr,none": 0.038435669935887165}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.46078431372549017, "acc_stderr,none": 0.03498501649369527}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.48945147679324896, "acc_stderr,none": 0.032539983791662855}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.45454545454545453, "acc_stderr,none": 0.04545454545454546}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.4074074074074074, "acc_stderr,none": 0.047500773411999854}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3619631901840491, "acc_stderr,none": 0.037757007291414416}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3208092485549133, "acc_stderr,none": 0.0251310002336479}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3440514469453376, "acc_stderr,none": 0.026981478043648022}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.36728395061728397, "acc_stderr,none": 0.026822801759507898}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3011734028683181, "acc_stderr,none": 0.011717148751648431}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.035650796707083106}, "mmlu_other": {"acc,none": 0.3746379143868684, "acc_stderr,none": 
0.008656226181610889, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.37358490566037733, "acc_stderr,none": 0.029773082713319875}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3872832369942196, "acc_stderr,none": 0.037143259063020656}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4349775784753363, "acc_stderr,none": 0.033272833702713445}, "mmlu_management": {"alias": " - management", "acc,none": 0.34951456310679613, "acc_stderr,none": 0.047211885060971716}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.43162393162393164, "acc_stderr,none": 0.0324483553531149}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.43, "acc_stderr,none": 0.04975698519562428}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.4163473818646232, "acc_stderr,none": 0.017627948030430298}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.32679738562091504, "acc_stderr,none": 0.026857294663281413}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.026992199173064356}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.30514705882352944, "acc_stderr,none": 0.027971541370170598}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3614457831325301, "acc_stderr,none": 0.03740059382029321}, "mmlu_social_sciences": {"acc,none": 0.3457913552161196, "acc_stderr,none": 0.008543185295853226, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.30701754385964913, "acc_stderr,none": 0.0433913832257986}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.31313131313131315, "acc_stderr,none": 0.033042050878136525}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.39896373056994816, "acc_stderr,none": 0.03533999094065696}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.29743589743589743, "acc_stderr,none": 0.023177408131465956}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.02934457250063434}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3504587155963303, "acc_stderr,none": 0.020456077599824457}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2900763358778626, "acc_stderr,none": 0.03980066246467766}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.36764705882352944, "acc_stderr,none": 0.019506291693954847}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.04607582090719976}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3020408163265306, "acc_stderr,none": 0.029393609319879818}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.4626865671641791, "acc_stderr,none": 0.035256751674679745}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.45, "acc_stderr,none": 0.05}, "mmlu_stem": {"acc,none": 0.2943228671106882, "acc_stderr,none": 0.008037384970229454, "alias": " - stem"}, 
"mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.4074074074074074, "acc_stderr,none": 0.04244633238353228}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.26973684210526316, "acc_stderr,none": 0.03611780560284898}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3402777777777778, "acc_stderr,none": 0.03962135573486219}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.48, "acc_stderr,none": 0.05021167315686781}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3404255319148936, "acc_stderr,none": 0.030976692998534443}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24338624338624337, "acc_stderr,none": 0.022101128787415426}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.4096774193548387, "acc_stderr,none": 0.027976054915347357}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.0319474007226554}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.026466117538959912}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.25165562913907286, "acc_stderr,none": 0.03543304234389985}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.19444444444444445, "acc_stderr,none": 0.02699145450203672}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.18392619680851063, "exact_match_stderr,custom-extract": 0.0034958263771979322, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3305439330543933, "exact_match_stderr,custom-extract": 0.01758001028718085}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.16476552598225602, "exact_match_stderr,custom-extract": 0.013215216167850041}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.10335689045936396, "exact_match_stderr,custom-extract": 0.009052076648374284}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.2097560975609756, "exact_match_stderr,custom-extract": 0.020131503920840902}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.25118483412322273, "exact_match_stderr,custom-extract": 0.014937235759500112}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.16305469556243551, 
"exact_match_stderr,custom-extract": 0.011873466052186893}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2273838630806846, "exact_match_stderr,custom-extract": 0.014663940146689517}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.1837270341207349, "exact_match_stderr,custom-extract": 0.01986609191654633}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.14259763851044505, "exact_match_stderr,custom-extract": 0.010542707604685755}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.14581791265729088, "exact_match_stderr,custom-extract": 0.009605363046932713}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.17207792207792208, "exact_match_stderr,custom-extract": 0.012423857401451374}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.13627254509018036, "exact_match_stderr,custom-extract": 0.015373681322287381}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1762894534257121, "exact_match_stderr,custom-extract": 0.010577015303223901}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2593984962406015, "exact_match_stderr,custom-extract": 0.015525545242193468}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.27, "acc_stderr,none": 0.01987435483128748, "acc_norm,none": 0.39, "acc_norm_stderr,none": 0.02183468586936921}, "piqa": {"alias": "piqa", "acc,none": 0.7290533188248096, "acc_stderr,none": 0.010369718937426843, "acc_norm,none": 0.7219804134929271, "acc_norm_stderr,none": 0.010453117358332811}, "race": {"alias": "race", "acc,none": 0.37799043062200954, "acc_stderr,none": 0.015006820447473677}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4508700102354145, "acc_stderr,none": 0.011259319269273937}, "winogrande": {"alias": "winogrande", "acc,none": 0.65982636148382, "acc_stderr,none": 0.013315218762417395}} {"created_at": "2025-05-03T02:20:02.564155", "global_step": 486000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.40784982935153585, "acc_stderr,none": 0.01436109728844969, "acc_norm,none": 0.43856655290102387, "acc_norm_stderr,none": 0.014500682618212864}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7331649831649831, "acc_stderr,none": 0.00907591585926726, "acc_norm,none": 0.7209595959595959, "acc_norm_stderr,none": 0.009203588704032635}, "boolq": {"alias": "boolq", "acc,none": 0.7703363914373089, "acc_stderr,none": 0.0073566287371636604}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.29238329238329236, "acc_stderr,none": 0.013022531002213357}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816506}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47540330611431986, "acc_stderr,none": 0.004983740145218612, "acc_norm,none": 0.6413065126468831, "acc_norm_stderr,none": 0.004786368011500453}, "mmlu": {"acc,none": 0.3397664150405925, "acc_stderr,none": 0.003965236208796706, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3277364505844846, "acc_stderr,none": 0.006775454816409681, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03670066451047182}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.4121212121212121, "acc_stderr,none": 0.038435669935887165}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.44607843137254904, "acc_stderr,none": 
0.03488845451304974}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.4978902953586498, "acc_stderr,none": 0.032546938018020076}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4380165289256198, "acc_stderr,none": 0.045291468044357915}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.42592592592592593, "acc_stderr,none": 0.0478034362693679}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3312883435582822, "acc_stderr,none": 0.03697983910025589}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.315028901734104, "acc_stderr,none": 0.025009313790069706}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24134078212290502, "acc_stderr,none": 0.01431099954796145}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3536977491961415, "acc_stderr,none": 0.027155208103200882}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3734567901234568, "acc_stderr,none": 0.026915003011380147}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3089960886571056, "acc_stderr,none": 0.011801729777239256}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.3797875764402961, "acc_stderr,none": 0.008680271254254667, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.37, "acc_stderr,none": 0.048523658709391}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3622641509433962, "acc_stderr,none": 0.0295822451283843}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3988439306358382, "acc_stderr,none": 0.03733626655383509}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4484304932735426, "acc_stderr,none": 0.03337883736255099}, "mmlu_management": {"alias": " - management", "acc,none": 0.3883495145631068, "acc_stderr,none": 0.048257293373563895}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.452991452991453, "acc_stderr,none": 0.0326109987309862}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.45, "acc_stderr,none": 0.04999999999999999}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.40485312899106, "acc_stderr,none": 0.017553246467720256}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.35294117647058826, "acc_stderr,none": 0.02736359328468493}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.02668456434046099}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.33455882352941174, "acc_stderr,none": 0.028661996202335307}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3614457831325301, "acc_stderr,none": 0.03740059382029321}, "mmlu_social_sciences": {"acc,none": 0.354891127721807, "acc_stderr,none": 0.008588640193149566, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.04266339443159394}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.30303030303030304, "acc_stderr,none": 0.032742879140268674}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 
0.41968911917098445, "acc_stderr,none": 0.03561587327685884}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3128205128205128, "acc_stderr,none": 0.023507579020645365}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31092436974789917, "acc_stderr,none": 0.030066761582977924}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.363302752293578, "acc_stderr,none": 0.020620603919625807}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.32061068702290074, "acc_stderr,none": 0.04093329229834278}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3627450980392157, "acc_stderr,none": 0.01945076843250551}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.37272727272727274, "acc_stderr,none": 0.04631381319425464}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3020408163265306, "acc_stderr,none": 0.029393609319879815}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.47761194029850745, "acc_stderr,none": 0.03531987930208731}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956911}, "mmlu_stem": {"acc,none": 0.30352045670789723, "acc_stderr,none": 0.008136846004156602, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.37037037037037035, "acc_stderr,none": 0.04171654161354543}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3092105263157895, "acc_stderr,none": 0.037610708698674805}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3402777777777778, "acc_stderr,none": 0.03962135573486219}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617746}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956911}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3446808510638298, "acc_stderr,none": 0.03106898596312215}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2896551724137931, "acc_stderr,none": 0.03780019230438014}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25396825396825395, "acc_stderr,none": 0.02241804289111394}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.4032258064516129, "acc_stderr,none": 0.027906150826041143}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.0317852971064275}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712173}, "mmlu_high_school_physics": {"alias": " - high_school_physics", 
"acc,none": 0.2582781456953642, "acc_stderr,none": 0.035737053147634576}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.028963702570791013}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.04432804055291519}, "mmlu_pro": {"exact_match,custom-extract": 0.18018617021276595, "exact_match_stderr,custom-extract": 0.0034673431817660166, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.32217573221757323, "exact_match_stderr,custom-extract": 0.01746419040868442}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.1634980988593156, "exact_match_stderr,custom-extract": 0.013174274584326415}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.10247349823321555, "exact_match_stderr,custom-extract": 0.009017748507579058}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.21707317073170732, "exact_match_stderr,custom-extract": 0.020384591313839226}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.2476303317535545, "exact_match_stderr,custom-extract": 0.014866330095923867}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.16615067079463364, "exact_match_stderr,custom-extract": 0.011963469940000507}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2200488997555012, "exact_match_stderr,custom-extract": 0.014493799859240616}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.16272965879265092, "exact_match_stderr,custom-extract": 0.018935396882827787}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1371480472297911, "exact_match_stderr,custom-extract": 0.01037209807700217}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.14285714285714285, "exact_match_stderr,custom-extract": 0.009523809523809466}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.17316017316017315, "exact_match_stderr,custom-extract": 0.012454716571952209}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1462925851703407, "exact_match_stderr,custom-extract": 0.015836201263905444}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.15473441108545036, "exact_match_stderr,custom-extract": 0.010038127358043922}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2669172932330827, "exact_match_stderr,custom-extract": 0.015668798035500312}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.268, "acc_stderr,none": 0.019827714859587568, "acc_norm,none": 0.392, "acc_norm_stderr,none": 0.021854684955611263}, "piqa": {"alias": "piqa", "acc,none": 0.7312295973884657, "acc_stderr,none": 0.010343392940090011, "acc_norm,none": 0.720348204570185, "acc_norm_stderr,none": 0.01047189953030656}, "race": {"alias": "race", "acc,none": 0.3770334928229665, "acc_stderr,none": 0.014999337089843356}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4524053224155578, "acc_stderr,none": 0.011262695440459564}, "winogrande": {"alias": "winogrande", "acc,none": 0.6582478295185478, "acc_stderr,none": 0.013330103018622856}} {"created_at": "2025-05-03T02:21:07.841364", "global_step": 488000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4061433447098976, "acc_stderr,none": 0.014351656690097863, 
"acc_norm,none": 0.43430034129692835, "acc_norm_stderr,none": 0.01448470304885736}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7297979797979798, "acc_stderr,none": 0.00911200222911985, "acc_norm,none": 0.7209595959595959, "acc_norm_stderr,none": 0.009203588704032633}, "boolq": {"alias": "boolq", "acc,none": 0.763914373088685, "acc_stderr,none": 0.007427619611412126}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.28746928746928746, "acc_stderr,none": 0.012957392226225588}, "copa": {"alias": "copa", "acc,none": 0.76, "acc_stderr,none": 0.04292346959909284}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47450707030472017, "acc_stderr,none": 0.0049832915782890425, "acc_norm,none": 0.6409081856203943, "acc_norm_stderr,none": 0.004787537385153012}, "mmlu": {"acc,none": 0.341760432986754, "acc_stderr,none": 0.003966290188369764, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.33156216790648246, "acc_stderr,none": 0.0067828713916655075, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.03619604524124249}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.42424242424242425, "acc_stderr,none": 0.03859268142070262}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.47058823529411764, "acc_stderr,none": 0.03503235296367992}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.510548523206751, "acc_stderr,none": 0.032539983791662855}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4628099173553719, "acc_stderr,none": 0.04551711196104218}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3611111111111111, "acc_stderr,none": 0.04643454608906274}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.34355828220858897, "acc_stderr,none": 0.03731133519673891}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.32947976878612717, "acc_stderr,none": 0.025305258131879716}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24022346368715083, "acc_stderr,none": 0.0142883438039253}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3762057877813505, "acc_stderr,none": 0.02751392568354943}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3765432098765432, "acc_stderr,none": 0.026959344518747784}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3044328552803129, "acc_stderr,none": 0.011752877592597579}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3567251461988304, "acc_stderr,none": 0.03674013002860954}, "mmlu_other": {"acc,none": 0.37721274541358224, "acc_stderr,none": 0.0086755519187428, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.38113207547169814, "acc_stderr,none": 0.02989060968628663}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.37572254335260113, "acc_stderr,none": 0.03692820767264867}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.32, "acc_stderr,none": 0.04688261722621504}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4304932735426009, "acc_stderr,none": 0.033231973029429394}, "mmlu_management": {"alias": " - management", "acc,none": 0.39805825242718446, "acc_stderr,none": 
0.048467482539772386}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.43162393162393164, "acc_stderr,none": 0.0324483553531149}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.44, "acc_stderr,none": 0.04988876515698589}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.4112388250319285, "acc_stderr,none": 0.01759597190805657}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.35294117647058826, "acc_stderr,none": 0.02736359328468494}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.026992199173064356}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.34191176470588236, "acc_stderr,none": 0.028814722422254177}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3313253012048193, "acc_stderr,none": 0.03664314777288085}, "mmlu_social_sciences": {"acc,none": 0.36529086772830677, "acc_stderr,none": 0.00863478536388612, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.04303684033537316}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3383838383838384, "acc_stderr,none": 0.03371124142626302}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.43005181347150256, "acc_stderr,none": 0.035729543331448094}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.30256410256410254, "acc_stderr,none": 0.023290888053772715}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3067226890756303, "acc_stderr,none": 0.02995382389188703}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3908256880733945, "acc_stderr,none": 0.020920058346111065}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3282442748091603, "acc_stderr,none": 0.041184385658062976}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.37745098039215685, "acc_stderr,none": 0.019610851474880276}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.04607582090719976}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3020408163265306, "acc_stderr,none": 0.029393609319879818}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.48756218905472637, "acc_stderr,none": 0.0353443984853958}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.5, "acc_stderr,none": 0.050251890762960605}, "mmlu_stem": {"acc,none": 0.2990802410402791, "acc_stderr,none": 0.008091408185819379, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384739}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3851851851851852, "acc_stderr,none": 0.042039210401562783}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.26973684210526316, "acc_stderr,none": 0.036117805602848975}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3194444444444444, "acc_stderr,none": 0.038990736873573344}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, 
"mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.17647058823529413, "acc_stderr,none": 0.0379328118530781}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.48, "acc_stderr,none": 0.050211673156867795}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.33617021276595743, "acc_stderr,none": 0.030881618520676942}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.30344827586206896, "acc_stderr,none": 0.038312260488503336}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776564}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.4129032258064516, "acc_stderr,none": 0.028009138125400387}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.0319474007226554}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24814814814814815, "acc_stderr,none": 0.0263357394040558}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.034791855725996586}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.028765111718046934}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.0443280405529152}, "mmlu_pro": {"exact_match,custom-extract": 0.18342752659574468, "exact_match_stderr,custom-extract": 0.0034929873225827486, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.32496513249651326, "exact_match_stderr,custom-extract": 0.017503503047556074}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.17363751584283904, "exact_match_stderr,custom-extract": 0.013494101406164668}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.1068904593639576, "exact_match_stderr,custom-extract": 0.009187355756744646}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.21707317073170732, "exact_match_stderr,custom-extract": 0.020384591313839226}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.245260663507109, "exact_match_stderr,custom-extract": 0.014818309281701568}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.17234262125902994, "exact_match_stderr,custom-extract": 0.012139029421075573}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.22616136919315402, "exact_match_stderr,custom-extract": 0.014636033244302024}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.17585301837270342, "exact_match_stderr,custom-extract": 0.019529244892152534}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.13169845594913715, "exact_match_stderr,custom-extract": 0.010195987296692636}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.16432272390821615, "exact_match_stderr,custom-extract": 0.010085588042335562}, "mmlu_pro_other": {"alias": " - other", 
"exact_match,custom-extract": 0.1590909090909091, "exact_match_stderr,custom-extract": 0.012039164679107935}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.15030060120240482, "exact_match_stderr,custom-extract": 0.01601394538357726}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.15242494226327943, "exact_match_stderr,custom-extract": 0.009976535615945838}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2681704260651629, "exact_match_stderr,custom-extract": 0.015692106905487214}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.264, "acc_stderr,none": 0.0197328855859221, "acc_norm,none": 0.396, "acc_norm_stderr,none": 0.021893529941665817}, "piqa": {"alias": "piqa", "acc,none": 0.7306855277475517, "acc_stderr,none": 0.01035000407058876, "acc_norm,none": 0.7247007616974973, "acc_norm_stderr,none": 0.01042142927736953}, "race": {"alias": "race", "acc,none": 0.3827751196172249, "acc_stderr,none": 0.015043306814111515}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4498464687819857, "acc_stderr,none": 0.0112570083604857}, "winogrande": {"alias": "winogrande", "acc,none": 0.6637726913970008, "acc_stderr,none": 0.013277286593993442}} {"created_at": "2025-05-03T04:09:05.390677", "global_step": 490000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4035836177474403, "acc_stderr,none": 0.014337158914268457, "acc_norm,none": 0.4325938566552901, "acc_norm_stderr,none": 0.014478005694182531}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7314814814814815, "acc_stderr,none": 0.009094042554994856, "acc_norm,none": 0.7209595959595959, "acc_norm_stderr,none": 0.009203588704032633}, "boolq": {"alias": "boolq", "acc,none": 0.7642201834862385, "acc_stderr,none": 0.00742429301951045}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2891072891072891, "acc_stderr,none": 0.012979310916953698}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816506}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4751045608444533, "acc_stderr,none": 0.004983592410934164, "acc_norm,none": 0.6406094403505278, "acc_norm_stderr,none": 0.004788412062375702}, "mmlu": {"acc,none": 0.33819968665432276, "acc_stderr,none": 0.003959929814293297, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.32624867162592985, "acc_stderr,none": 0.006768024364199171, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1984126984126984, "acc_stderr,none": 0.03567016675276863}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.43636363636363634, "acc_stderr,none": 0.03872592983524754}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.46078431372549017, "acc_stderr,none": 0.03498501649369527}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.47257383966244726, "acc_stderr,none": 0.03249822718301303}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4628099173553719, "acc_stderr,none": 0.04551711196104218}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.37962962962962965, "acc_stderr,none": 0.04691521224077742}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.36809815950920244, "acc_stderr,none": 0.03789213935838396}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.315028901734104, "acc_stderr,none": 0.025009313790069716}, "mmlu_moral_scenarios": 
{"alias": " - moral_scenarios", "acc,none": 0.2446927374301676, "acc_stderr,none": 0.014378169884098423}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3504823151125402, "acc_stderr,none": 0.027098652621301747}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3611111111111111, "acc_stderr,none": 0.026725868809100786}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3044328552803129, "acc_stderr,none": 0.011752877592597577}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.03565079670708311}, "mmlu_other": {"acc,none": 0.37141937560347604, "acc_stderr,none": 0.008652684909473572, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3660377358490566, "acc_stderr,none": 0.029647813539365242}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.36416184971098264, "acc_stderr,none": 0.03669072477416907}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4125560538116592, "acc_stderr,none": 0.03304062175449296}, "mmlu_management": {"alias": " - management", "acc,none": 0.36893203883495146, "acc_stderr,none": 0.0477761518115674}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.43162393162393164, "acc_stderr,none": 0.0324483553531149}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.44, "acc_stderr,none": 0.04988876515698589}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.4061302681992337, "acc_stderr,none": 0.017562037406478916}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.35294117647058826, "acc_stderr,none": 0.027363593284684955}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2907801418439716, "acc_stderr,none": 0.027090664368353178}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3272058823529412, "acc_stderr,none": 0.028501452860396563}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3373493975903614, "acc_stderr,none": 0.0368078369072758}, "mmlu_social_sciences": {"acc,none": 0.3574910627234319, "acc_stderr,none": 0.008591601703531852, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.04266339443159394}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3181818181818182, "acc_stderr,none": 0.03318477333845331}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.41450777202072536, "acc_stderr,none": 0.03555300319557672}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.28974358974358977, "acc_stderr,none": 0.023000628243687978}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3067226890756303, "acc_stderr,none": 0.02995382389188703}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3743119266055046, "acc_stderr,none": 0.020748959408988306}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.31297709923664124, "acc_stderr,none": 0.04066962905677698}, "mmlu_professional_psychology": {"alias": " - professional_psychology", 
"acc,none": 0.38235294117647056, "acc_stderr,none": 0.019659922493623343}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.37272727272727274, "acc_stderr,none": 0.04631381319425465}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2938775510204082, "acc_stderr,none": 0.029162738410249772}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.47761194029850745, "acc_stderr,none": 0.03531987930208731}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.5, "acc_stderr,none": 0.050251890762960605}, "mmlu_stem": {"acc,none": 0.3044719314938154, "acc_stderr,none": 0.008124896244499831, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3925925925925926, "acc_stderr,none": 0.04218506215368879}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.29605263157894735, "acc_stderr,none": 0.03715062154998905}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3402777777777778, "acc_stderr,none": 0.039621355734862175}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909284}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.18627450980392157, "acc_stderr,none": 0.038739587141493524}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.51, "acc_stderr,none": 0.05024183937956913}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.33617021276595743, "acc_stderr,none": 0.030881618520676942}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.038061426873099935}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02256989707491842}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.4129032258064516, "acc_stderr,none": 0.02800913812540039}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.3103448275862069, "acc_stderr,none": 0.03255086769970103}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110196}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712166}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23178807947019867, "acc_stderr,none": 0.03445406271987053}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.028353212866863448}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.33035714285714285, "acc_stderr,none": 0.04464285714285713}, "mmlu_pro": {"exact_match,custom-extract": 0.18384308510638298, "exact_match_stderr,custom-extract": 0.0034963470869140215, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3291492329149233, "exact_match_stderr,custom-extract": 0.017561146780265928}, "mmlu_pro_business": {"alias": " - business", 
"exact_match,custom-extract": 0.16856780735107732, "exact_match_stderr,custom-extract": 0.013336369763619692}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.11219081272084806, "exact_match_stderr,custom-extract": 0.009384414071431937}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.2, "exact_match_stderr,custom-extract": 0.019778727057365927}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.24170616113744076, "exact_match_stderr,custom-extract": 0.014745137840103598}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.15479876160990713, "exact_match_stderr,custom-extract": 0.011625887729987489}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.22860635696821516, "exact_match_stderr,custom-extract": 0.014691669532004212}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.16272965879265092, "exact_match_stderr,custom-extract": 0.018935396882827787}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.13169845594913715, "exact_match_stderr,custom-extract": 0.010195987296692636}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.153960029607698, "exact_match_stderr,custom-extract": 0.00982273775259319}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.17316017316017315, "exact_match_stderr,custom-extract": 0.012454716571952212}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.16633266533066132, "exact_match_stderr,custom-extract": 0.016686701398526124}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.16782140107775212, "exact_match_stderr,custom-extract": 0.010372766376157203}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2719298245614035, "exact_match_stderr,custom-extract": 0.015761076648135943}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.266, "acc_stderr,none": 0.019780559675655486, "acc_norm,none": 0.394, "acc_norm_stderr,none": 0.021874299301689253}, "piqa": {"alias": "piqa", "acc,none": 0.7268770402611534, "acc_stderr,none": 0.010395730264453267, "acc_norm,none": 0.720348204570185, "acc_norm_stderr,none": 0.01047189953030656}, "race": {"alias": "race", "acc,none": 0.37894736842105264, "acc_stderr,none": 0.015014241655133452}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4498464687819857, "acc_stderr,none": 0.0112570083604857}, "winogrande": {"alias": "winogrande", "acc,none": 0.6535122336227308, "acc_stderr,none": 0.013373773411685653}} {"created_at": "2025-05-03T06:08:32.025138", "global_step": 492000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4069965870307167, "acc_stderr,none": 0.014356399418009137, "acc_norm,none": 0.4402730375426621, "acc_norm_stderr,none": 0.014506769524804243}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7327441077441077, "acc_stderr,none": 0.00908046324601747, "acc_norm,none": 0.7196969696969697, "acc_norm_stderr,none": 0.009216306864088034}, "boolq": {"alias": "boolq", "acc,none": 0.772782874617737, "acc_stderr,none": 0.007328950945979886}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.28746928746928746, "acc_stderr,none": 0.012957392226225588}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816506}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47560246962756425, "acc_stderr,none": 0.0049838376415028965, "acc_norm,none": 
0.641804421429994, "acc_norm_stderr,none": 0.00478490124855871}, "mmlu": {"acc,none": 0.34354080615296967, "acc_stderr,none": 0.003974731025609914, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.33156216790648246, "acc_stderr,none": 0.006794278058752767, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1984126984126984, "acc_stderr,none": 0.03567016675276863}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.44242424242424244, "acc_stderr,none": 0.038783721137112745}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.46568627450980393, "acc_stderr,none": 0.03501038327635897}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.48523206751054854, "acc_stderr,none": 0.032533028078777386}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4462809917355372, "acc_stderr,none": 0.0453793517794788}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.4074074074074074, "acc_stderr,none": 0.047500773411999854}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3619631901840491, "acc_stderr,none": 0.037757007291414416}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.315028901734104, "acc_stderr,none": 0.025009313790069713}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24581005586592178, "acc_stderr,none": 0.014400296429225615}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.34726688102893893, "acc_stderr,none": 0.027040745502307336}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3734567901234568, "acc_stderr,none": 0.026915003011380147}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.31421121251629724, "acc_stderr,none": 0.011855911587048228}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.3778564531702607, "acc_stderr,none": 0.008676211645032454, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.37358490566037733, "acc_stderr,none": 0.02977308271331987}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3699421965317919, "acc_stderr,none": 0.036812296333943194}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4304932735426009, "acc_stderr,none": 0.033231973029429394}, "mmlu_management": {"alias": " - management", "acc,none": 0.39805825242718446, "acc_stderr,none": 0.048467482539772386}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.44017094017094016, "acc_stderr,none": 0.032520741720630506}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.44, "acc_stderr,none": 0.049888765156985884}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.41379310344827586, "acc_stderr,none": 0.01761220408466377}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3464052287581699, "acc_stderr,none": 0.027245613047215355}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.28368794326241137, "acc_stderr,none": 0.02689170942834396}, "mmlu_professional_medicine": {"alias": " - 
professional_medicine", "acc,none": 0.33088235294117646, "acc_stderr,none": 0.02858270975389844}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3433734939759036, "acc_stderr,none": 0.036965843170106004}, "mmlu_social_sciences": {"acc,none": 0.36626584335391615, "acc_stderr,none": 0.008652052188694258, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.32456140350877194, "acc_stderr,none": 0.044045561573747685}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.31313131313131315, "acc_stderr,none": 0.033042050878136525}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.41450777202072536, "acc_stderr,none": 0.03555300319557672}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3153846153846154, "acc_stderr,none": 0.023559646983189932}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3277310924369748, "acc_stderr,none": 0.03048991141767323}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3926605504587156, "acc_stderr,none": 0.020937505161201093}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.33587786259541985, "acc_stderr,none": 0.041423137719966634}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.37254901960784315, "acc_stderr,none": 0.019559646809215934}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.36363636363636365, "acc_stderr,none": 0.04607582090719976}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.30612244897959184, "acc_stderr,none": 0.02950489645459595}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.48756218905472637, "acc_stderr,none": 0.03534439848539579}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.48, "acc_stderr,none": 0.050211673156867795}, "mmlu_stem": {"acc,none": 0.3054234062797336, "acc_stderr,none": 0.008133938815204834, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384739}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3851851851851852, "acc_stderr,none": 0.042039210401562783}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.27631578947368424, "acc_stderr,none": 0.03639057569952925}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3472222222222222, "acc_stderr,none": 0.03981240543717861}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956911}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3404255319148936, "acc_stderr,none": 0.030976692998534443}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.038061426873099935}, "mmlu_elementary_mathematics": {"alias": " - 
elementary_mathematics", "acc,none": 0.2566137566137566, "acc_stderr,none": 0.022494510767503154}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.43548387096774194, "acc_stderr,none": 0.028206225591502737}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.29064039408866993, "acc_stderr,none": 0.0319474007226554}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.35, "acc_stderr,none": 0.047937248544110196}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712166}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23178807947019867, "acc_stderr,none": 0.03445406271987053}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.028963702570791016}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "mmlu_pro": {"exact_match,custom-extract": 0.18218085106382978, "exact_match_stderr,custom-extract": 0.003484810620377296, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3179916317991632, "exact_match_stderr,custom-extract": 0.017403884250878417}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.17110266159695817, "exact_match_stderr,custom-extract": 0.013415771307906003}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.10424028268551237, "exact_match_stderr,custom-extract": 0.009086199159417191}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.2121951219512195, "exact_match_stderr,custom-extract": 0.020216937884754132}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.24052132701421802, "exact_match_stderr,custom-extract": 0.014720440282752337}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.1568627450980392, "exact_match_stderr,custom-extract": 0.0116888387242459}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.22127139364303178, "exact_match_stderr,custom-extract": 0.014522609899199774}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.16272965879265092, "exact_match_stderr,custom-extract": 0.018935396882827787}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1344232515894641, "exact_match_stderr,custom-extract": 0.010284747799160813}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.15247964470762398, "exact_match_stderr,custom-extract": 0.00978394764898708}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.17207792207792208, "exact_match_stderr,custom-extract": 0.012423857401451376}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1402805611222445, "exact_match_stderr,custom-extract": 0.015561893867712511}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.1739799846035412, "exact_match_stderr,custom-extract": 0.010522224976679326}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2719298245614035, "exact_match_stderr,custom-extract": 0.01576107664813595}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.264, "acc_stderr,none": 0.019732885585922098, "acc_norm,none": 0.396, 
"acc_norm_stderr,none": 0.021893529941665817}, "piqa": {"alias": "piqa", "acc,none": 0.7306855277475517, "acc_stderr,none": 0.01035000407058876, "acc_norm,none": 0.7247007616974973, "acc_norm_stderr,none": 0.01042142927736953}, "race": {"alias": "race", "acc,none": 0.3799043062200957, "acc_stderr,none": 0.01502160080493565}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.4508700102354145, "acc_stderr,none": 0.011259319269273938}, "winogrande": {"alias": "winogrande", "acc,none": 0.6566692975532754, "acc_stderr,none": 0.01334482318535801}} {"created_at": "2025-05-03T07:42:11.506429", "global_step": 494000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4035836177474403, "acc_stderr,none": 0.014337158914268457, "acc_norm,none": 0.43856655290102387, "acc_norm_stderr,none": 0.014500682618212864}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7331649831649831, "acc_stderr,none": 0.009075915859267258, "acc_norm,none": 0.7171717171717171, "acc_norm_stderr,none": 0.009241472775328228}, "boolq": {"alias": "boolq", "acc,none": 0.7691131498470948, "acc_stderr,none": 0.007370335500493339}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2915642915642916, "acc_stderr,none": 0.013011802821401595}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816506}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47520414260107546, "acc_stderr,none": 0.004983641854351153, "acc_norm,none": 0.6419040031866162, "acc_norm_stderr,none": 0.004784607222774649}, "mmlu": {"acc,none": 0.3453923942458339, "acc_stderr,none": 0.003980840239935939, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3292242295430393, "acc_stderr,none": 0.0067810483739687176, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.03619604524124249}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.42424242424242425, "acc_stderr,none": 0.03859268142070262}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.46078431372549017, "acc_stderr,none": 0.03498501649369527}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.4978902953586498, "acc_stderr,none": 0.032546938018020076}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4462809917355372, "acc_stderr,none": 0.04537935177947879}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.37962962962962965, "acc_stderr,none": 0.04691521224077742}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3619631901840491, "acc_stderr,none": 0.037757007291414416}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3208092485549133, "acc_stderr,none": 0.025131000233647897}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24581005586592178, "acc_stderr,none": 0.014400296429225617}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3504823151125402, "acc_stderr,none": 0.027098652621301744}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3765432098765432, "acc_stderr,none": 0.026959344518747787}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.30378096479791394, "acc_stderr,none": 0.011745787720472465}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.34502923976608185, "acc_stderr,none": 0.036459813773888065}, "mmlu_other": {"acc,none": 0.3817186997103315, "acc_stderr,none": 0.008701055208963249, 
"alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.36981132075471695, "acc_stderr,none": 0.02971142188010792}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3815028901734104, "acc_stderr,none": 0.03703851193099521}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4349775784753363, "acc_stderr,none": 0.033272833702713445}, "mmlu_management": {"alias": " - management", "acc,none": 0.39805825242718446, "acc_stderr,none": 0.048467482539772386}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.44871794871794873, "acc_stderr,none": 0.0325833464938688}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.44, "acc_stderr,none": 0.04988876515698589}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.4086845466155811, "acc_stderr,none": 0.017579250148153397}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.35294117647058826, "acc_stderr,none": 0.027363593284684937}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.3049645390070922, "acc_stderr,none": 0.02746470844202213}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.35661764705882354, "acc_stderr,none": 0.02909720956841195}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3253012048192771, "acc_stderr,none": 0.03647168523683227}, "mmlu_social_sciences": {"acc,none": 0.3721156971075723, "acc_stderr,none": 0.008688508540966224, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.32456140350877194, "acc_stderr,none": 0.044045561573747685}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.033586181457325226}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.43005181347150256, "acc_stderr,none": 0.035729543331448094}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.33076923076923076, "acc_stderr,none": 0.023854795680971114}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3277310924369748, "acc_stderr,none": 0.03048991141767323}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3963302752293578, "acc_stderr,none": 0.02097146994790053}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3435114503816794, "acc_stderr,none": 0.041649760719448786}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.380718954248366, "acc_stderr,none": 0.01964380155792481}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.35454545454545455, "acc_stderr,none": 0.04582004841505416}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.31020408163265306, "acc_stderr,none": 0.029613459872484378}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.47761194029850745, "acc_stderr,none": 0.03531987930208731}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.47, "acc_stderr,none": 0.050161355804659205}, "mmlu_stem": {"acc,none": 0.3076435141135427, "acc_stderr,none": 0.008155043722905417, "alias": " - stem"}, 
"mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384739}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3851851851851852, "acc_stderr,none": 0.042039210401562783}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.035834961763610625}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3472222222222222, "acc_stderr,none": 0.03981240543717861}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.22549019607843138, "acc_stderr,none": 0.041583075330832865}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956911}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3574468085106383, "acc_stderr,none": 0.03132941789476425}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2827586206896552, "acc_stderr,none": 0.037528339580033376}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.022789673145776564}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.42258064516129035, "acc_stderr,none": 0.02810096472427264}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2955665024630542, "acc_stderr,none": 0.032104944337514575}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.38, "acc_stderr,none": 0.04878317312145633}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.02671924078371217}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.034791855725996586}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.028765111718046934}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3125, "acc_stderr,none": 0.043994650575715215}, "mmlu_pro": {"exact_match,custom-extract": 0.18267952127659576, "exact_match_stderr,custom-extract": 0.003492826921774581, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.30264993026499304, "exact_match_stderr,custom-extract": 0.017168770774579085}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.16730038022813687, "exact_match_stderr,custom-extract": 0.01329626121085858}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.11749116607773852, "exact_match_stderr,custom-extract": 0.009574824784172204}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.22195121951219512, "exact_match_stderr,custom-extract": 0.02054804589006829}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.23933649289099526, "exact_match_stderr,custom-extract": 0.014695587900810762}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.15583075335397317, 
"exact_match_stderr,custom-extract": 0.011657452909176065}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2310513447432763, "exact_match_stderr,custom-extract": 0.014746599750625127}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.17060367454068243, "exact_match_stderr,custom-extract": 0.019296717799305904}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1307901907356948, "exact_match_stderr,custom-extract": 0.010166080711813392}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.15247964470762398, "exact_match_stderr,custom-extract": 0.009783947648987079}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.17532467532467533, "exact_match_stderr,custom-extract": 0.01251590249750937}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.15831663326653306, "exact_match_stderr,custom-extract": 0.01635772767882581}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.16320246343341033, "exact_match_stderr,custom-extract": 0.010257374338618739}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2694235588972431, "exact_match_stderr,custom-extract": 0.015715255828559514}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.264, "acc_stderr,none": 0.019732885585922098, "acc_norm,none": 0.398, "acc_norm_stderr,none": 0.021912377885779974}, "piqa": {"alias": "piqa", "acc,none": 0.7301414581066377, "acc_stderr,none": 0.010356595421852209, "acc_norm,none": 0.7225244831338411, "acc_norm_stderr,none": 0.010446818281039947}, "race": {"alias": "race", "acc,none": 0.3827751196172249, "acc_stderr,none": 0.015043306814111515}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.45138178096212894, "acc_stderr,none": 0.01126045668162444}, "winogrande": {"alias": "winogrande", "acc,none": 0.6621941594317285, "acc_stderr,none": 0.013292583502910885}} {"created_at": "2025-05-03T10:14:24.328785", "global_step": 496000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4052901023890785, "acc_stderr,none": 0.014346869060229337, "acc_norm,none": 0.44283276450511944, "acc_norm_stderr,none": 0.014515573873348904}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7306397306397306, "acc_stderr,none": 0.009103043207756983, "acc_norm,none": 0.718013468013468, "acc_norm_stderr,none": 0.009233124071053646}, "boolq": {"alias": "boolq", "acc,none": 0.7697247706422018, "acc_stderr,none": 0.007363493078403616}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.29074529074529076, "acc_stderr,none": 0.013001023498635364}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816506}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.47540330611431986, "acc_stderr,none": 0.004983740145218612, "acc_norm,none": 0.6417048396733719, "acc_norm_stderr,none": 0.0047851950498891595}, "mmlu": {"acc,none": 0.34425295541945594, "acc_stderr,none": 0.0039768596251749, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3294367693942614, "acc_stderr,none": 0.00678167572474211, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03670066451047181}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.43636363636363634, "acc_stderr,none": 0.03872592983524754}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.46078431372549017, "acc_stderr,none": 
0.03498501649369527}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.5021097046413502, "acc_stderr,none": 0.032546938018020076}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4380165289256198, "acc_stderr,none": 0.045291468044357915}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3888888888888889, "acc_stderr,none": 0.047128212574267705}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3558282208588957, "acc_stderr,none": 0.03761521380046734}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.315028901734104, "acc_stderr,none": 0.02500931379006971}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2424581005586592, "acc_stderr,none": 0.014333522059217892}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3440514469453376, "acc_stderr,none": 0.026981478043648022}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.37037037037037035, "acc_stderr,none": 0.026869490744815247}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.31029986962190353, "acc_stderr,none": 0.011815439293469825}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.32748538011695905, "acc_stderr,none": 0.035993357714560276}, "mmlu_other": {"acc,none": 0.37914386868361766, "acc_stderr,none": 0.008687635435215911, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.36981132075471695, "acc_stderr,none": 0.02971142188010792}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3699421965317919, "acc_stderr,none": 0.036812296333943194}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4304932735426009, "acc_stderr,none": 0.033231973029429394}, "mmlu_management": {"alias": " - management", "acc,none": 0.42718446601941745, "acc_stderr,none": 0.04897957737781168}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.44017094017094016, "acc_stderr,none": 0.032520741720630506}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.43, "acc_stderr,none": 0.04975698519562428}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.4086845466155811, "acc_stderr,none": 0.017579250148153397}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.3562091503267974, "acc_stderr,none": 0.02742047766262923}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.29432624113475175, "acc_stderr,none": 0.0271871270115038}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3382352941176471, "acc_stderr,none": 0.028739328513983576}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3313253012048193, "acc_stderr,none": 0.03664314777288085}, "mmlu_social_sciences": {"acc,none": 0.36821579460513487, "acc_stderr,none": 0.008664461483418769, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.32456140350877194, "acc_stderr,none": 0.044045561573747685}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3181818181818182, "acc_stderr,none": 0.03318477333845331}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", 
"acc,none": 0.42487046632124353, "acc_stderr,none": 0.035674713352125395}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3282051282051282, "acc_stderr,none": 0.023807633198657262}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.33613445378151263, "acc_stderr,none": 0.030684737115135363}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3963302752293578, "acc_stderr,none": 0.02097146994790053}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.3282442748091603, "acc_stderr,none": 0.041184385658062976}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.3660130718954248, "acc_stderr,none": 0.01948802574552967}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.37272727272727274, "acc_stderr,none": 0.04631381319425464}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3020408163265306, "acc_stderr,none": 0.029393609319879818}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.48258706467661694, "acc_stderr,none": 0.03533389234739245}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.48, "acc_stderr,none": 0.050211673156867795}, "mmlu_stem": {"acc,none": 0.30859498889946085, "acc_stderr,none": 0.008153618277361684, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3925925925925926, "acc_stderr,none": 0.04218506215368879}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2631578947368421, "acc_stderr,none": 0.035834961763610625}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3541666666666667, "acc_stderr,none": 0.039994111357535424}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617747}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.5, "acc_stderr,none": 0.050251890762960605}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.34893617021276596, "acc_stderr,none": 0.031158522131357773}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.038061426873099935}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2566137566137566, "acc_stderr,none": 0.022494510767503154}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.432258064516129, "acc_stderr,none": 0.02818173972001941}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.30049261083743845, "acc_stderr,none": 0.03225799476233484}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.02696242432507383}, "mmlu_high_school_physics": {"alias": " - 
high_school_physics", "acc,none": 0.23178807947019867, "acc_stderr,none": 0.03445406271987053}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.028963702570791016}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.3482142857142857, "acc_stderr,none": 0.045218299028335865}, "mmlu_pro": {"exact_match,custom-extract": 0.18201462765957446, "exact_match_stderr,custom-extract": 0.003484247706736926, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.31520223152022314, "exact_match_stderr,custom-extract": 0.01736278145457227}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.16476552598225602, "exact_match_stderr,custom-extract": 0.013215216167850041}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.10865724381625441, "exact_match_stderr,custom-extract": 0.009253806404080385}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.21951219512195122, "exact_match_stderr,custom-extract": 0.020466837110489664}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.23933649289099526, "exact_match_stderr,custom-extract": 0.014695587900810766}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.15995872033023736, "exact_match_stderr,custom-extract": 0.011781934278001626}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.22616136919315402, "exact_match_stderr,custom-extract": 0.014636033244302024}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.17060367454068243, "exact_match_stderr,custom-extract": 0.019296717799305904}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1307901907356948, "exact_match_stderr,custom-extract": 0.010166080711813392}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.16136195410806808, "exact_match_stderr,custom-extract": 0.010012002939971329}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.1699134199134199, "exact_match_stderr,custom-extract": 0.01236159999946829}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.14829659318637275, "exact_match_stderr,custom-extract": 0.0159255744939775}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.15704387990762125, "exact_match_stderr,custom-extract": 0.010098936605569635}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2719298245614035, "exact_match_stderr,custom-extract": 0.015761076648135947}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.266, "acc_stderr,none": 0.019780559675655486, "acc_norm,none": 0.396, "acc_norm_stderr,none": 0.021893529941665817}, "piqa": {"alias": "piqa", "acc,none": 0.7279651795429815, "acc_stderr,none": 0.01038276378624738, "acc_norm,none": 0.7181719260065288, "acc_norm_stderr,none": 0.010496675231258164}, "race": {"alias": "race", "acc,none": 0.38086124401913873, "acc_stderr,none": 0.015028897988042758}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.45138178096212894, "acc_stderr,none": 0.011260456681624438}, "winogrande": {"alias": "winogrande", "acc,none": 0.665351223362273, "acc_stderr,none": 0.013261823629558366}} {"created_at": "2025-05-03T12:12:27.468408", "global_step": 498000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.4052901023890785, "acc_stderr,none": 
0.014346869060229337, "acc_norm,none": 0.44368600682593856, "acc_norm_stderr,none": 0.014518421825670444}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7302188552188552, "acc_stderr,none": 0.009107527914671064, "acc_norm,none": 0.7171717171717171, "acc_norm_stderr,none": 0.009241472775328228}, "boolq": {"alias": "boolq", "acc,none": 0.7691131498470948, "acc_stderr,none": 0.007370335500493343}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2891072891072891, "acc_stderr,none": 0.0129793109169537}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816506}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4751045608444533, "acc_stderr,none": 0.004983592410934164, "acc_norm,none": 0.6428002389962159, "acc_norm_stderr,none": 0.004781950883460504}, "mmlu": {"acc,none": 0.34297108673978066, "acc_stderr,none": 0.003972238006364669, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.3294367693942614, "acc_stderr,none": 0.006776167208504884, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.0361960452412425}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.4484848484848485, "acc_stderr,none": 0.038835659779569286}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.46568627450980393, "acc_stderr,none": 0.03501038327635897}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.5021097046413502, "acc_stderr,none": 0.032546938018020076}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4462809917355372, "acc_stderr,none": 0.0453793517794788}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3888888888888889, "acc_stderr,none": 0.047128212574267705}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3558282208588957, "acc_stderr,none": 0.03761521380046734}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.3092485549132948, "acc_stderr,none": 0.024883140570071755}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24134078212290502, "acc_stderr,none": 0.014310999547961445}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3440514469453376, "acc_stderr,none": 0.026981478043648022}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.3734567901234568, "acc_stderr,none": 0.026915003011380147}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.30834419817470665, "acc_stderr,none": 0.011794833789715329}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3391812865497076, "acc_stderr,none": 0.03631053496488905}, "mmlu_other": {"acc,none": 0.3794657225619569, "acc_stderr,none": 0.008690981024699845, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.36981132075471695, "acc_stderr,none": 0.02971142188010792}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.3699421965317919, "acc_stderr,none": 0.036812296333943194}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.42152466367713004, "acc_stderr,none": 0.033141902221106564}, "mmlu_management": {"alias": " - management", "acc,none": 0.4077669902912621, 
"acc_stderr,none": 0.04865777570410768}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.452991452991453, "acc_stderr,none": 0.0326109987309862}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.42, "acc_stderr,none": 0.049604496374885836}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.4086845466155811, "acc_stderr,none": 0.017579250148153397}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.35947712418300654, "acc_stderr,none": 0.027475969910660952}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2978723404255319, "acc_stderr,none": 0.027281608344469414}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.34558823529411764, "acc_stderr,none": 0.02888819310398864}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3313253012048193, "acc_stderr,none": 0.03664314777288085}, "mmlu_social_sciences": {"acc,none": 0.365940851478713, "acc_stderr,none": 0.0086458358023838, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.04372748290278008}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.033586181457325226}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.43005181347150256, "acc_stderr,none": 0.035729543331448094}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.31025641025641026, "acc_stderr,none": 0.023454674889404288}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.31932773109243695, "acc_stderr,none": 0.0302839955258844}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3981651376146789, "acc_stderr,none": 0.020987989422654264}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.32061068702290074, "acc_stderr,none": 0.04093329229834278}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.36437908496732024, "acc_stderr,none": 0.0194695182215737}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.39090909090909093, "acc_stderr,none": 0.04673752333670238}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3020408163265306, "acc_stderr,none": 0.029393609319879818}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.48756218905472637, "acc_stderr,none": 0.0353443984853958}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.47, "acc_stderr,none": 0.050161355804659205}, "mmlu_stem": {"acc,none": 0.3047890897557881, "acc_stderr,none": 0.00813445737237051, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.4, "acc_stderr,none": 0.042320736951515885}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.27631578947368424, "acc_stderr,none": 0.03639057569952925}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3472222222222222, "acc_stderr,none": 0.03981240543717861}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, 
"mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.040925639582376556}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.48, "acc_stderr,none": 0.050211673156867795}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.33617021276595743, "acc_stderr,none": 0.030881618520676942}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2896551724137931, "acc_stderr,none": 0.03780019230438014}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2566137566137566, "acc_stderr,none": 0.022494510767503154}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.4290322580645161, "acc_stderr,none": 0.028156036538233217}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.031785297106427496}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.26666666666666666, "acc_stderr,none": 0.026962424325073835}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.034791855725996586}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.028765111718046934}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.32142857142857145, "acc_stderr,none": 0.04432804055291519}, "mmlu_pro": {"exact_match,custom-extract": 0.18367686170212766, "exact_match_stderr,custom-extract": 0.003498964616069176, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3179916317991632, "exact_match_stderr,custom-extract": 0.017403884250878413}, "mmlu_pro_business": {"alias": " - business", "exact_match,custom-extract": 0.16856780735107732, "exact_match_stderr,custom-extract": 0.013336369763619692}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.1157243816254417, "exact_match_stderr,custom-extract": 0.009512068239624288}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.21707317073170732, "exact_match_stderr,custom-extract": 0.020384591313839226}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.24170616113744076, "exact_match_stderr,custom-extract": 0.014745137840103598}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.16615067079463364, "exact_match_stderr,custom-extract": 0.011963469940000524}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.2237163814180929, "exact_match_stderr,custom-extract": 0.014579682804410434}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.1679790026246719, "exact_match_stderr,custom-extract": 0.01917797923756724}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.14168937329700274, "exact_match_stderr,custom-extract": 0.010514643243492047}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.153960029607698, "exact_match_stderr,custom-extract": 0.00982273775259319}, "mmlu_pro_other": {"alias": " - other", 
"exact_match,custom-extract": 0.1764069264069264, "exact_match_stderr,custom-extract": 0.01254623184906948}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.1402805611222445, "exact_match_stderr,custom-extract": 0.015561893867712511}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.15935334872979215, "exact_match_stderr,custom-extract": 0.010158977410017719}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2669172932330827, "exact_match_stderr,custom-extract": 0.015668798035500312}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.266, "acc_stderr,none": 0.019780559675655486, "acc_norm,none": 0.394, "acc_norm_stderr,none": 0.021874299301689253}, "piqa": {"alias": "piqa", "acc,none": 0.7295973884657236, "acc_stderr,none": 0.010363167031620796, "acc_norm,none": 0.7236126224156693, "acc_norm_stderr,none": 0.010434162388275613}, "race": {"alias": "race", "acc,none": 0.38086124401913873, "acc_stderr,none": 0.015028897988042758}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.45291709314227224, "acc_stderr,none": 0.01126379679411243}, "winogrande": {"alias": "winogrande", "acc,none": 0.6566692975532754, "acc_stderr,none": 0.013344823185358009}} {"created_at": "2025-05-03T13:12:02.749816", "global_step": 500000, "arc_challenge": {"alias": "arc_challenge", "acc,none": 0.40187713310580203, "acc_stderr,none": 0.014327268614578281, "acc_norm,none": 0.43856655290102387, "acc_norm_stderr,none": 0.014500682618212864}, "arc_easy": {"alias": "arc_easy", "acc,none": 0.7314814814814815, "acc_stderr,none": 0.009094042554994856, "acc_norm,none": 0.7192760942760943, "acc_norm_stderr,none": 0.009220526174711358}, "boolq": {"alias": "boolq", "acc,none": 0.7700305810397553, "acc_stderr,none": 0.007360063651505803}, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.2841932841932842, "acc_stderr,none": 0.012912932309514279}, "copa": {"alias": "copa", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816506}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4751045608444533, "acc_stderr,none": 0.0049835924109341645, "acc_norm,none": 0.6420035849432384, "acc_norm_stderr,none": 0.004784312972495384}, "mmlu": {"acc,none": 0.34261501210653755, "acc_stderr,none": 0.003972341794408205, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.33049946865037194, "acc_stderr,none": 0.0067835256855849485, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.19047619047619047, "acc_stderr,none": 0.03512207412302052}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.4303030303030303, "acc_stderr,none": 0.03866225962879077}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.46568627450980393, "acc_stderr,none": 0.03501038327635897}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.5021097046413502, "acc_stderr,none": 0.032546938018020076}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.4380165289256198, "acc_stderr,none": 0.045291468044357915}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.3888888888888889, "acc_stderr,none": 0.047128212574267705}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.36809815950920244, "acc_stderr,none": 0.03789213935838396}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.315028901734104, "acc_stderr,none": 0.02500931379006971}, 
"mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2435754189944134, "acc_stderr,none": 0.014355911964767865}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.3633440514469453, "acc_stderr,none": 0.02731684767419271}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.37037037037037035, "acc_stderr,none": 0.026869490744815244}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.3089960886571056, "acc_stderr,none": 0.011801729777239254}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.036155076303109344}, "mmlu_other": {"acc,none": 0.3759253299002253, "acc_stderr,none": 0.008671719910651066, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.38, "acc_stderr,none": 0.04878317312145632}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.3660377358490566, "acc_stderr,none": 0.02964781353936524}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.37572254335260113, "acc_stderr,none": 0.03692820767264867}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.4304932735426009, "acc_stderr,none": 0.033231973029429394}, "mmlu_management": {"alias": " - management", "acc,none": 0.39805825242718446, "acc_stderr,none": 0.048467482539772386}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.44017094017094016, "acc_stderr,none": 0.032520741720630506}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.44, "acc_stderr,none": 0.04988876515698589}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.40357598978288634, "acc_stderr,none": 0.017544332237926417}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.35294117647058826, "acc_stderr,none": 0.027363593284684934}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2907801418439716, "acc_stderr,none": 0.027090664368353178}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.33088235294117646, "acc_stderr,none": 0.02858270975389844}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.3253012048192771, "acc_stderr,none": 0.03647168523683227}, "mmlu_social_sciences": {"acc,none": 0.3639909002274943, "acc_stderr,none": 0.008636500524938126, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.04372748290278008}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.32323232323232326, "acc_stderr,none": 0.03332299921070644}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.43523316062176165, "acc_stderr,none": 0.035780381650085846}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.3076923076923077, "acc_stderr,none": 0.023400928918310485}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.3319327731092437, "acc_stderr,none": 0.030588697013783663}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3834862385321101, "acc_stderr,none": 0.020847156641915984}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.32061068702290074, "acc_stderr,none": 0.04093329229834278}, "mmlu_professional_psychology": {"alias": " - 
professional_psychology", "acc,none": 0.36764705882352944, "acc_stderr,none": 0.019506291693954843}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.37272727272727274, "acc_stderr,none": 0.04631381319425464}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3020408163265306, "acc_stderr,none": 0.029393609319879818}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.48756218905472637, "acc_stderr,none": 0.03534439848539579}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.48, "acc_stderr,none": 0.050211673156867795}, "mmlu_stem": {"acc,none": 0.3070091975895972, "acc_stderr,none": 0.008151207994643158, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.3851851851851852, "acc_stderr,none": 0.042039210401562783}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.28289473684210525, "acc_stderr,none": 0.03665349695640767}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.3472222222222222, "acc_stderr,none": 0.03981240543717861}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617747}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.49, "acc_stderr,none": 0.05024183937956911}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3446808510638298, "acc_stderr,none": 0.03106898596312215}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.038061426873099935}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.022644212615525218}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.42258064516129035, "acc_stderr,none": 0.02810096472427264}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2955665024630542, "acc_stderr,none": 0.032104944337514575}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.36, "acc_stderr,none": 0.048241815132442176}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.026466117538959912}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.24503311258278146, "acc_stderr,none": 0.03511807571804725}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.22685185185185186, "acc_stderr,none": 0.028561650102422252}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.33035714285714285, "acc_stderr,none": 0.04464285714285714}, "mmlu_pro": {"exact_match,custom-extract": 0.18367686170212766, "exact_match_stderr,custom-extract": 0.003498581572180752, "alias": "mmlu_pro"}, "mmlu_pro_biology": {"alias": " - biology", "exact_match,custom-extract": 0.3138075313807531, "exact_match_stderr,custom-extract": 0.017341958540453725}, "mmlu_pro_business": {"alias": " - 
business", "exact_match,custom-extract": 0.16603295310519645, "exact_match_stderr,custom-extract": 0.013255877519716387}, "mmlu_pro_chemistry": {"alias": " - chemistry", "exact_match,custom-extract": 0.11130742049469965, "exact_match_stderr,custom-extract": 0.009352043831026096}, "mmlu_pro_computer_science": {"alias": " - computer_science", "exact_match,custom-extract": 0.22682926829268293, "exact_match_stderr,custom-extract": 0.020707401045044642}, "mmlu_pro_economics": {"alias": " - economics", "exact_match,custom-extract": 0.24052132701421802, "exact_match_stderr,custom-extract": 0.014720440282752337}, "mmlu_pro_engineering": {"alias": " - engineering", "exact_match,custom-extract": 0.16718266253869968, "exact_match_stderr,custom-extract": 0.011993137667893426}, "mmlu_pro_health": {"alias": " - health", "exact_match,custom-extract": 0.22493887530562348, "exact_match_stderr,custom-extract": 0.014607947807460348}, "mmlu_pro_history": {"alias": " - history", "exact_match,custom-extract": 0.1679790026246719, "exact_match_stderr,custom-extract": 0.019177979237567245}, "mmlu_pro_law": {"alias": " - law", "exact_match,custom-extract": 0.1335149863760218, "exact_match_stderr,custom-extract": 0.01025531945289569}, "mmlu_pro_math": {"alias": " - math", "exact_match,custom-extract": 0.15544041450777202, "exact_match_stderr,custom-extract": 0.00986121065535071}, "mmlu_pro_other": {"alias": " - other", "exact_match,custom-extract": 0.17316017316017315, "exact_match_stderr,custom-extract": 0.012454716571952212}, "mmlu_pro_philosophy": {"alias": " - philosophy", "exact_match,custom-extract": 0.15030060120240482, "exact_match_stderr,custom-extract": 0.01601394538357726}, "mmlu_pro_physics": {"alias": " - physics", "exact_match,custom-extract": 0.16628175519630484, "exact_match_stderr,custom-extract": 0.01033462224083037}, "mmlu_pro_psychology": {"alias": " - psychology", "exact_match,custom-extract": 0.2681704260651629, "exact_match_stderr,custom-extract": 0.01569210690548721}, "openbookqa": {"alias": "openbookqa", "acc,none": 0.264, "acc_stderr,none": 0.019732885585922098, "acc_norm,none": 0.396, "acc_norm_stderr,none": 0.021893529941665817}, "piqa": {"alias": "piqa", "acc,none": 0.7306855277475517, "acc_stderr,none": 0.01035000407058876, "acc_norm,none": 0.7225244831338411, "acc_norm_stderr,none": 0.010446818281039945}, "race": {"alias": "race", "acc,none": 0.3837320574162679, "acc_stderr,none": 0.015050418634703647}, "social_iqa": {"alias": "social_iqa", "acc,none": 0.45291709314227224, "acc_stderr,none": 0.01126379679411243}, "winogrande": {"alias": "winogrande", "acc,none": 0.6614048934490924, "acc_stderr,none": 0.013300169865842416}}