{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.992108229988726, "eval_steps": 500, "global_step": 110, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009019165727170236, "grad_norm": 0.8173778653144836, "learning_rate": 4.9989804820704735e-05, "loss": 0.8968, "num_input_tokens_seen": 2097152, "step": 1 }, { "epoch": 0.018038331454340473, "grad_norm": 0.5661506056785583, "learning_rate": 4.995922759815339e-05, "loss": 0.8202, "num_input_tokens_seen": 4194304, "step": 2 }, { "epoch": 0.02705749718151071, "grad_norm": 0.4142468571662903, "learning_rate": 4.9908293271567286e-05, "loss": 0.7783, "num_input_tokens_seen": 6291456, "step": 3 }, { "epoch": 0.036076662908680945, "grad_norm": 0.2738800048828125, "learning_rate": 4.9837043383713753e-05, "loss": 0.7642, "num_input_tokens_seen": 8388608, "step": 4 }, { "epoch": 0.04509582863585118, "grad_norm": 0.2011900395154953, "learning_rate": 4.9745536047023324e-05, "loss": 0.7475, "num_input_tokens_seen": 10485760, "step": 5 }, { "epoch": 0.05411499436302142, "grad_norm": 0.161564901471138, "learning_rate": 4.963384589619233e-05, "loss": 0.7319, "num_input_tokens_seen": 12582912, "step": 6 }, { "epoch": 0.06313416009019165, "grad_norm": 0.14298053085803986, "learning_rate": 4.9502064027309836e-05, "loss": 0.7125, "num_input_tokens_seen": 14680064, "step": 7 }, { "epoch": 0.07215332581736189, "grad_norm": 0.14268024265766144, "learning_rate": 4.935029792355834e-05, "loss": 0.7117, "num_input_tokens_seen": 16777216, "step": 8 }, { "epoch": 0.08117249154453213, "grad_norm": 0.1712426245212555, "learning_rate": 4.917867136754893e-05, "loss": 0.724, "num_input_tokens_seen": 18874368, "step": 9 }, { "epoch": 0.09019165727170236, "grad_norm": 0.15666206181049347, "learning_rate": 4.898732434036244e-05, "loss": 0.7144, "num_input_tokens_seen": 20971520, "step": 10 }, { "epoch": 0.0992108229988726, "grad_norm": 0.139165461063385, "learning_rate": 4.877641290737884e-05, "loss": 0.697, "num_input_tokens_seen": 23068672, "step": 11 }, { "epoch": 0.10822998872604284, "grad_norm": 0.15043741464614868, "learning_rate": 4.854610909098812e-05, "loss": 0.7117, "num_input_tokens_seen": 25165824, "step": 12 }, { "epoch": 0.11724915445321307, "grad_norm": 0.13478276133537292, "learning_rate": 4.829660073028631e-05, "loss": 0.6996, "num_input_tokens_seen": 27262976, "step": 13 }, { "epoch": 0.1262683201803833, "grad_norm": 0.12838861346244812, "learning_rate": 4.802809132787125e-05, "loss": 0.7172, "num_input_tokens_seen": 29360128, "step": 14 }, { "epoch": 0.13528748590755355, "grad_norm": 0.11803996562957764, "learning_rate": 4.774079988386296e-05, "loss": 0.725, "num_input_tokens_seen": 31457280, "step": 15 }, { "epoch": 0.14430665163472378, "grad_norm": 0.11841780692338943, "learning_rate": 4.743496071728396e-05, "loss": 0.7073, "num_input_tokens_seen": 33554432, "step": 16 }, { "epoch": 0.15332581736189402, "grad_norm": 0.11901742964982986, "learning_rate": 4.711082327494536e-05, "loss": 0.7021, "num_input_tokens_seen": 35651584, "step": 17 }, { "epoch": 0.16234498308906425, "grad_norm": 0.11239754408597946, "learning_rate": 4.6768651927994434e-05, "loss": 0.6894, "num_input_tokens_seen": 37748736, "step": 18 }, { "epoch": 0.1713641488162345, "grad_norm": 0.10254418104887009, "learning_rate": 4.640872575628973e-05, "loss": 0.7031, "num_input_tokens_seen": 39845888, "step": 19 }, { "epoch": 0.18038331454340473, "grad_norm": 0.09802790731191635, "learning_rate": 4.6031338320779534e-05, "loss": 0.6629, "num_input_tokens_seen": 41943040, "step": 20 }, { "epoch": 0.18940248027057496, "grad_norm": 0.0993066355586052, "learning_rate": 4.563679742406935e-05, "loss": 0.6774, "num_input_tokens_seen": 44040192, "step": 21 }, { "epoch": 0.1984216459977452, "grad_norm": 0.09300491958856583, "learning_rate": 4.522542485937369e-05, "loss": 0.7041, "num_input_tokens_seen": 46137344, "step": 22 }, { "epoch": 0.20744081172491544, "grad_norm": 0.08633296191692352, "learning_rate": 4.479755614805688e-05, "loss": 0.6894, "num_input_tokens_seen": 48234496, "step": 23 }, { "epoch": 0.21645997745208567, "grad_norm": 0.0841744914650917, "learning_rate": 4.4353540265977064e-05, "loss": 0.673, "num_input_tokens_seen": 50331648, "step": 24 }, { "epoch": 0.2254791431792559, "grad_norm": 0.09111865609884262, "learning_rate": 4.389373935885646e-05, "loss": 0.6691, "num_input_tokens_seen": 52428800, "step": 25 }, { "epoch": 0.23449830890642615, "grad_norm": 0.08103901892900467, "learning_rate": 4.341852844691012e-05, "loss": 0.6794, "num_input_tokens_seen": 54525952, "step": 26 }, { "epoch": 0.24351747463359638, "grad_norm": 0.08423160016536713, "learning_rate": 4.292829511897409e-05, "loss": 0.6946, "num_input_tokens_seen": 56623104, "step": 27 }, { "epoch": 0.2525366403607666, "grad_norm": 0.09218155592679977, "learning_rate": 4.242343921638234e-05, "loss": 0.6939, "num_input_tokens_seen": 58720256, "step": 28 }, { "epoch": 0.2615558060879369, "grad_norm": 0.07839576154947281, "learning_rate": 4.1904372506850484e-05, "loss": 0.6759, "num_input_tokens_seen": 60817408, "step": 29 }, { "epoch": 0.2705749718151071, "grad_norm": 0.08427103608846664, "learning_rate": 4.137151834863213e-05, "loss": 0.6856, "num_input_tokens_seen": 62914560, "step": 30 }, { "epoch": 0.27959413754227735, "grad_norm": 0.08769369125366211, "learning_rate": 4.082531134522176e-05, "loss": 0.6753, "num_input_tokens_seen": 65011712, "step": 31 }, { "epoch": 0.28861330326944756, "grad_norm": 0.09144359081983566, "learning_rate": 4.0266196990885955e-05, "loss": 0.6769, "num_input_tokens_seen": 67108864, "step": 32 }, { "epoch": 0.2976324689966178, "grad_norm": 0.08329298347234726, "learning_rate": 3.969463130731183e-05, "loss": 0.6664, "num_input_tokens_seen": 69206016, "step": 33 }, { "epoch": 0.30665163472378804, "grad_norm": 0.07759370654821396, "learning_rate": 3.911108047166924e-05, "loss": 0.6383, "num_input_tokens_seen": 71303168, "step": 34 }, { "epoch": 0.3156708004509583, "grad_norm": 0.07686188071966171, "learning_rate": 3.851602043638994e-05, "loss": 0.6726, "num_input_tokens_seen": 73400320, "step": 35 }, { "epoch": 0.3246899661781285, "grad_norm": 0.0853535607457161, "learning_rate": 3.790993654097405e-05, "loss": 0.6612, "num_input_tokens_seen": 75497472, "step": 36 }, { "epoch": 0.3337091319052988, "grad_norm": 0.08194194734096527, "learning_rate": 3.72933231161401e-05, "loss": 0.6763, "num_input_tokens_seen": 77594624, "step": 37 }, { "epoch": 0.342728297632469, "grad_norm": 0.0810617133975029, "learning_rate": 3.6666683080641846e-05, "loss": 0.6533, "num_input_tokens_seen": 79691776, "step": 38 }, { "epoch": 0.35174746335963925, "grad_norm": 0.08145678043365479, "learning_rate": 3.603052753108053e-05, "loss": 0.6598, "num_input_tokens_seen": 81788928, "step": 39 }, { "epoch": 0.36076662908680945, "grad_norm": 0.07560884952545166, "learning_rate": 3.5385375325047166e-05, "loss": 0.6801, "num_input_tokens_seen": 83886080, "step": 40 }, { "epoch": 0.3697857948139797, "grad_norm": 0.06700561195611954, "learning_rate": 3.4731752657934794e-05, "loss": 0.6668, "num_input_tokens_seen": 85983232, "step": 41 }, { "epoch": 0.3788049605411499, "grad_norm": 0.076134592294693, "learning_rate": 3.4070192633766025e-05, "loss": 0.6756, "num_input_tokens_seen": 88080384, "step": 42 }, { "epoch": 0.3878241262683202, "grad_norm": 0.06908991187810898, "learning_rate": 3.3401234830385756e-05, "loss": 0.6589, "num_input_tokens_seen": 90177536, "step": 43 }, { "epoch": 0.3968432919954904, "grad_norm": 0.07035559415817261, "learning_rate": 3.272542485937369e-05, "loss": 0.6435, "num_input_tokens_seen": 92274688, "step": 44 }, { "epoch": 0.40586245772266066, "grad_norm": 0.06883817166090012, "learning_rate": 3.2043313921035743e-05, "loss": 0.6602, "num_input_tokens_seen": 94371840, "step": 45 }, { "epoch": 0.41488162344983087, "grad_norm": 0.06745729595422745, "learning_rate": 3.135545835483718e-05, "loss": 0.648, "num_input_tokens_seen": 96468992, "step": 46 }, { "epoch": 0.42390078917700114, "grad_norm": 0.06895168870687485, "learning_rate": 3.0662419185644115e-05, "loss": 0.6493, "num_input_tokens_seen": 98566144, "step": 47 }, { "epoch": 0.43291995490417134, "grad_norm": 0.06701447814702988, "learning_rate": 2.996476166614364e-05, "loss": 0.6701, "num_input_tokens_seen": 100663296, "step": 48 }, { "epoch": 0.4419391206313416, "grad_norm": 0.07372091710567474, "learning_rate": 2.92630548158156e-05, "loss": 0.6674, "num_input_tokens_seen": 102760448, "step": 49 }, { "epoch": 0.4509582863585118, "grad_norm": 0.07131503522396088, "learning_rate": 2.8557870956832132e-05, "loss": 0.6635, "num_input_tokens_seen": 104857600, "step": 50 }, { "epoch": 0.4599774520856821, "grad_norm": 0.0702953040599823, "learning_rate": 2.7849785247263515e-05, "loss": 0.6598, "num_input_tokens_seen": 106954752, "step": 51 }, { "epoch": 0.4689966178128523, "grad_norm": 0.06502864509820938, "learning_rate": 2.7139375211970996e-05, "loss": 0.6732, "num_input_tokens_seen": 109051904, "step": 52 }, { "epoch": 0.47801578354002255, "grad_norm": 0.06253138929605484, "learning_rate": 2.6427220271569203e-05, "loss": 0.6546, "num_input_tokens_seen": 111149056, "step": 53 }, { "epoch": 0.48703494926719276, "grad_norm": 0.06724034994840622, "learning_rate": 2.5713901269842404e-05, "loss": 0.6705, "num_input_tokens_seen": 113246208, "step": 54 }, { "epoch": 0.496054114994363, "grad_norm": 0.06660095602273941, "learning_rate": 2.5e-05, "loss": 0.6463, "num_input_tokens_seen": 115343360, "step": 55 }, { "epoch": 0.5050732807215332, "grad_norm": 0.06747590005397797, "learning_rate": 2.42860987301576e-05, "loss": 0.6505, "num_input_tokens_seen": 117440512, "step": 56 }, { "epoch": 0.5140924464487034, "grad_norm": 0.07006718963384628, "learning_rate": 2.35727797284308e-05, "loss": 0.6524, "num_input_tokens_seen": 119537664, "step": 57 }, { "epoch": 0.5231116121758738, "grad_norm": 0.06662806868553162, "learning_rate": 2.2860624788029013e-05, "loss": 0.6559, "num_input_tokens_seen": 121634816, "step": 58 }, { "epoch": 0.532130777903044, "grad_norm": 0.06757567077875137, "learning_rate": 2.2150214752736488e-05, "loss": 0.6511, "num_input_tokens_seen": 123731968, "step": 59 }, { "epoch": 0.5411499436302142, "grad_norm": 0.07193508744239807, "learning_rate": 2.1442129043167874e-05, "loss": 0.6457, "num_input_tokens_seen": 125829120, "step": 60 }, { "epoch": 0.5501691093573844, "grad_norm": 0.06620261073112488, "learning_rate": 2.0736945184184405e-05, "loss": 0.6492, "num_input_tokens_seen": 127926272, "step": 61 }, { "epoch": 0.5591882750845547, "grad_norm": 0.06846100836992264, "learning_rate": 2.003523833385637e-05, "loss": 0.6539, "num_input_tokens_seen": 130023424, "step": 62 }, { "epoch": 0.5682074408117249, "grad_norm": 0.06885959208011627, "learning_rate": 1.9337580814355888e-05, "loss": 0.6417, "num_input_tokens_seen": 132120576, "step": 63 }, { "epoch": 0.5772266065388951, "grad_norm": 0.06715461611747742, "learning_rate": 1.8644541645162834e-05, "loss": 0.6663, "num_input_tokens_seen": 134217728, "step": 64 }, { "epoch": 0.5862457722660653, "grad_norm": 0.06593496352434158, "learning_rate": 1.795668607896426e-05, "loss": 0.6572, "num_input_tokens_seen": 136314880, "step": 65 }, { "epoch": 0.5952649379932357, "grad_norm": 0.06741371005773544, "learning_rate": 1.7274575140626318e-05, "loss": 0.6825, "num_input_tokens_seen": 138412032, "step": 66 }, { "epoch": 0.6042841037204059, "grad_norm": 0.06627509742975235, "learning_rate": 1.6598765169614243e-05, "loss": 0.6509, "num_input_tokens_seen": 140509184, "step": 67 }, { "epoch": 0.6133032694475761, "grad_norm": 0.06725791096687317, "learning_rate": 1.5929807366233977e-05, "loss": 0.6619, "num_input_tokens_seen": 142606336, "step": 68 }, { "epoch": 0.6223224351747464, "grad_norm": 0.06500604748725891, "learning_rate": 1.5268247342065215e-05, "loss": 0.6759, "num_input_tokens_seen": 144703488, "step": 69 }, { "epoch": 0.6313416009019166, "grad_norm": 0.07281672209501266, "learning_rate": 1.4614624674952842e-05, "loss": 0.6568, "num_input_tokens_seen": 146800640, "step": 70 }, { "epoch": 0.6403607666290868, "grad_norm": 0.0661671832203865, "learning_rate": 1.3969472468919461e-05, "loss": 0.6472, "num_input_tokens_seen": 148897792, "step": 71 }, { "epoch": 0.649379932356257, "grad_norm": 0.06678120791912079, "learning_rate": 1.3333316919358157e-05, "loss": 0.6473, "num_input_tokens_seen": 150994944, "step": 72 }, { "epoch": 0.6583990980834273, "grad_norm": 0.06216076388955116, "learning_rate": 1.2706676883859903e-05, "loss": 0.6485, "num_input_tokens_seen": 153092096, "step": 73 }, { "epoch": 0.6674182638105975, "grad_norm": 0.06877604126930237, "learning_rate": 1.2090063459025955e-05, "loss": 0.6544, "num_input_tokens_seen": 155189248, "step": 74 }, { "epoch": 0.6764374295377678, "grad_norm": 0.06808489561080933, "learning_rate": 1.148397956361007e-05, "loss": 0.6763, "num_input_tokens_seen": 157286400, "step": 75 }, { "epoch": 0.685456595264938, "grad_norm": 0.06282905489206314, "learning_rate": 1.0888919528330777e-05, "loss": 0.6406, "num_input_tokens_seen": 159383552, "step": 76 }, { "epoch": 0.6944757609921083, "grad_norm": 0.06371884793043137, "learning_rate": 1.0305368692688174e-05, "loss": 0.6502, "num_input_tokens_seen": 161480704, "step": 77 }, { "epoch": 0.7034949267192785, "grad_norm": 0.06734833121299744, "learning_rate": 9.733803009114045e-06, "loss": 0.6495, "num_input_tokens_seen": 163577856, "step": 78 }, { "epoch": 0.7125140924464487, "grad_norm": 0.06442791223526001, "learning_rate": 9.174688654778243e-06, "loss": 0.6469, "num_input_tokens_seen": 165675008, "step": 79 }, { "epoch": 0.7215332581736189, "grad_norm": 0.06670290976762772, "learning_rate": 8.628481651367876e-06, "loss": 0.6642, "num_input_tokens_seen": 167772160, "step": 80 }, { "epoch": 0.7305524239007892, "grad_norm": 0.06524420529603958, "learning_rate": 8.09562749314952e-06, "loss": 0.6598, "num_input_tokens_seen": 169869312, "step": 81 }, { "epoch": 0.7395715896279594, "grad_norm": 0.06147584691643715, "learning_rate": 7.576560783617668e-06, "loss": 0.6461, "num_input_tokens_seen": 171966464, "step": 82 }, { "epoch": 0.7485907553551296, "grad_norm": 0.06680367887020111, "learning_rate": 7.071704881025915e-06, "loss": 0.6706, "num_input_tokens_seen": 174063616, "step": 83 }, { "epoch": 0.7576099210822999, "grad_norm": 0.0657731294631958, "learning_rate": 6.5814715530898745e-06, "loss": 0.6768, "num_input_tokens_seen": 176160768, "step": 84 }, { "epoch": 0.7666290868094702, "grad_norm": 0.06921912729740143, "learning_rate": 6.106260641143546e-06, "loss": 0.6446, "num_input_tokens_seen": 178257920, "step": 85 }, { "epoch": 0.7756482525366404, "grad_norm": 0.06477085500955582, "learning_rate": 5.646459734022938e-06, "loss": 0.6393, "num_input_tokens_seen": 180355072, "step": 86 }, { "epoch": 0.7846674182638106, "grad_norm": 0.06648603081703186, "learning_rate": 5.202443851943126e-06, "loss": 0.6585, "num_input_tokens_seen": 182452224, "step": 87 }, { "epoch": 0.7936865839909808, "grad_norm": 0.06899042427539825, "learning_rate": 4.7745751406263165e-06, "loss": 0.6393, "num_input_tokens_seen": 184549376, "step": 88 }, { "epoch": 0.8027057497181511, "grad_norm": 0.06467007100582123, "learning_rate": 4.36320257593065e-06, "loss": 0.6613, "num_input_tokens_seen": 186646528, "step": 89 }, { "epoch": 0.8117249154453213, "grad_norm": 0.06616765260696411, "learning_rate": 3.968661679220468e-06, "loss": 0.6631, "num_input_tokens_seen": 188743680, "step": 90 }, { "epoch": 0.8207440811724915, "grad_norm": 0.06439518183469772, "learning_rate": 3.591274243710277e-06, "loss": 0.6356, "num_input_tokens_seen": 190840832, "step": 91 }, { "epoch": 0.8297632468996617, "grad_norm": 0.06294091790914536, "learning_rate": 3.2313480720055745e-06, "loss": 0.6475, "num_input_tokens_seen": 192937984, "step": 92 }, { "epoch": 0.8387824126268321, "grad_norm": 0.06395729631185532, "learning_rate": 2.889176725054643e-06, "loss": 0.6387, "num_input_tokens_seen": 195035136, "step": 93 }, { "epoch": 0.8478015783540023, "grad_norm": 0.06324164569377899, "learning_rate": 2.565039282716045e-06, "loss": 0.6533, "num_input_tokens_seen": 197132288, "step": 94 }, { "epoch": 0.8568207440811725, "grad_norm": 0.06285514682531357, "learning_rate": 2.2592001161370392e-06, "loss": 0.6606, "num_input_tokens_seen": 199229440, "step": 95 }, { "epoch": 0.8658399098083427, "grad_norm": 0.06468215584754944, "learning_rate": 1.97190867212875e-06, "loss": 0.6667, "num_input_tokens_seen": 201326592, "step": 96 }, { "epoch": 0.874859075535513, "grad_norm": 0.06431297212839127, "learning_rate": 1.703399269713693e-06, "loss": 0.6599, "num_input_tokens_seen": 203423744, "step": 97 }, { "epoch": 0.8838782412626832, "grad_norm": 0.06363896280527115, "learning_rate": 1.4538909090118846e-06, "loss": 0.6499, "num_input_tokens_seen": 205520896, "step": 98 }, { "epoch": 0.8928974069898534, "grad_norm": 0.06607792526483536, "learning_rate": 1.2235870926211619e-06, "loss": 0.649, "num_input_tokens_seen": 207618048, "step": 99 }, { "epoch": 0.9019165727170236, "grad_norm": 0.06387382745742798, "learning_rate": 1.0126756596375686e-06, "loss": 0.6551, "num_input_tokens_seen": 209715200, "step": 100 }, { "epoch": 0.910935738444194, "grad_norm": 0.07034407556056976, "learning_rate": 8.213286324510738e-07, "loss": 0.668, "num_input_tokens_seen": 211812352, "step": 101 }, { "epoch": 0.9199549041713642, "grad_norm": 0.07087666541337967, "learning_rate": 6.497020764416633e-07, "loss": 0.6505, "num_input_tokens_seen": 213909504, "step": 102 }, { "epoch": 0.9289740698985344, "grad_norm": 0.07003826647996902, "learning_rate": 4.979359726901639e-07, "loss": 0.6504, "num_input_tokens_seen": 216006656, "step": 103 }, { "epoch": 0.9379932356257046, "grad_norm": 0.06608197838068008, "learning_rate": 3.6615410380767544e-07, "loss": 0.6504, "num_input_tokens_seen": 218103808, "step": 104 }, { "epoch": 0.9470124013528749, "grad_norm": 0.0653860792517662, "learning_rate": 2.544639529766829e-07, "loss": 0.6491, "num_input_tokens_seen": 220200960, "step": 105 }, { "epoch": 0.9560315670800451, "grad_norm": 0.06089319288730621, "learning_rate": 1.6295661628624447e-07, "loss": 0.6449, "num_input_tokens_seen": 222298112, "step": 106 }, { "epoch": 0.9650507328072153, "grad_norm": 0.06895752251148224, "learning_rate": 9.170672843271666e-08, "loss": 0.6729, "num_input_tokens_seen": 224395264, "step": 107 }, { "epoch": 0.9740698985343855, "grad_norm": 0.06209622696042061, "learning_rate": 4.07724018466088e-08, "loss": 0.6308, "num_input_tokens_seen": 226492416, "step": 108 }, { "epoch": 0.9830890642615558, "grad_norm": 0.06596114486455917, "learning_rate": 1.0195179295269252e-08, "loss": 0.6695, "num_input_tokens_seen": 228589568, "step": 109 }, { "epoch": 0.992108229988726, "grad_norm": 0.06238327547907829, "learning_rate": 0.0, "loss": 0.6467, "num_input_tokens_seen": 230686720, "step": 110 }, { "epoch": 0.992108229988726, "num_input_tokens_seen": 230686720, "step": 110, "total_flos": 9.786587384895242e+18, "train_loss": 0.6728460962122137, "train_runtime": 17352.9545, "train_samples_per_second": 3.271, "train_steps_per_second": 0.006 } ], "logging_steps": 1, "max_steps": 110, "num_input_tokens_seen": 230686720, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.786587384895242e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }