{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 9.92988929889299, "eval_steps": 500, "global_step": 1350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007380073800738007, "grad_norm": 6.363606224312279, "learning_rate": 0.0, "loss": 1.4914, "num_tokens": 108129.0, "step": 1 }, { "epoch": 0.014760147601476014, "grad_norm": 6.012702836892834, "learning_rate": 2.439024390243903e-07, "loss": 1.4111, "num_tokens": 262871.0, "step": 2 }, { "epoch": 0.02214022140221402, "grad_norm": 6.615048103988674, "learning_rate": 4.878048780487805e-07, "loss": 1.525, "num_tokens": 383320.0, "step": 3 }, { "epoch": 0.02952029520295203, "grad_norm": 6.205874266530899, "learning_rate": 7.317073170731707e-07, "loss": 1.5202, "num_tokens": 486579.0, "step": 4 }, { "epoch": 0.03690036900369004, "grad_norm": 6.790213435426661, "learning_rate": 9.75609756097561e-07, "loss": 1.5711, "num_tokens": 576134.0, "step": 5 }, { "epoch": 0.04428044280442804, "grad_norm": 6.316668195961204, "learning_rate": 1.2195121951219514e-06, "loss": 1.4689, "num_tokens": 686509.0, "step": 6 }, { "epoch": 0.05166051660516605, "grad_norm": 5.938905378037534, "learning_rate": 1.4634146341463414e-06, "loss": 1.4945, "num_tokens": 784231.0, "step": 7 }, { "epoch": 0.05904059040590406, "grad_norm": 4.429329163711381, "learning_rate": 1.707317073170732e-06, "loss": 1.5244, "num_tokens": 882871.0, "step": 8 }, { "epoch": 0.06642066420664207, "grad_norm": 4.195021375348304, "learning_rate": 1.951219512195122e-06, "loss": 1.4425, "num_tokens": 1005895.0, "step": 9 }, { "epoch": 0.07380073800738007, "grad_norm": 2.885978260862124, "learning_rate": 2.1951219512195125e-06, "loss": 1.4326, "num_tokens": 1105351.0, "step": 10 }, { "epoch": 0.08118081180811808, "grad_norm": 2.573831231053168, "learning_rate": 2.4390243902439027e-06, "loss": 1.3451, "num_tokens": 1226716.0, "step": 11 }, { "epoch": 0.08856088560885608, "grad_norm": 2.146494562191251, "learning_rate": 2.682926829268293e-06, "loss": 1.3703, "num_tokens": 1353599.0, "step": 12 }, { "epoch": 0.0959409594095941, "grad_norm": 1.5117677326527352, "learning_rate": 2.926829268292683e-06, "loss": 1.171, "num_tokens": 1461850.0, "step": 13 }, { "epoch": 0.1033210332103321, "grad_norm": 2.5271220924968527, "learning_rate": 3.1707317073170736e-06, "loss": 1.2756, "num_tokens": 1558756.0, "step": 14 }, { "epoch": 0.11070110701107011, "grad_norm": 1.838325022665236, "learning_rate": 3.414634146341464e-06, "loss": 1.1876, "num_tokens": 1663867.0, "step": 15 }, { "epoch": 0.11808118081180811, "grad_norm": 1.8568958719583475, "learning_rate": 3.6585365853658537e-06, "loss": 1.2399, "num_tokens": 1766057.0, "step": 16 }, { "epoch": 0.12546125461254612, "grad_norm": 1.3695578003158202, "learning_rate": 3.902439024390244e-06, "loss": 1.23, "num_tokens": 1869252.0, "step": 17 }, { "epoch": 0.13284132841328414, "grad_norm": 1.153225570030305, "learning_rate": 4.146341463414634e-06, "loss": 1.1294, "num_tokens": 1957924.0, "step": 18 }, { "epoch": 0.14022140221402213, "grad_norm": 0.7456771122232927, "learning_rate": 4.390243902439025e-06, "loss": 0.9469, "num_tokens": 2097152.0, "step": 19 }, { "epoch": 0.14760147601476015, "grad_norm": 1.1465877970965435, "learning_rate": 4.634146341463416e-06, "loss": 1.1601, "num_tokens": 2226077.0, "step": 20 }, { "epoch": 0.15498154981549817, "grad_norm": 0.7815788812110701, "learning_rate": 4.8780487804878055e-06, "loss": 1.0533, "num_tokens": 2363480.0, "step": 21 }, { "epoch": 0.16236162361623616, "grad_norm": 0.7732486130250715, "learning_rate": 5.121951219512195e-06, "loss": 1.5139, "num_tokens": 2543914.0, "step": 22 }, { "epoch": 0.16974169741697417, "grad_norm": 0.7060278478886041, "learning_rate": 5.365853658536586e-06, "loss": 1.0774, "num_tokens": 2677305.0, "step": 23 }, { "epoch": 0.17712177121771217, "grad_norm": 0.8930074024117203, "learning_rate": 5.609756097560977e-06, "loss": 1.1666, "num_tokens": 2774583.0, "step": 24 }, { "epoch": 0.18450184501845018, "grad_norm": 0.7498436046258535, "learning_rate": 5.853658536585366e-06, "loss": 1.1053, "num_tokens": 2883688.0, "step": 25 }, { "epoch": 0.1918819188191882, "grad_norm": 0.8912766185518683, "learning_rate": 6.0975609756097564e-06, "loss": 1.13, "num_tokens": 2995769.0, "step": 26 }, { "epoch": 0.1992619926199262, "grad_norm": 0.9011384648250562, "learning_rate": 6.341463414634147e-06, "loss": 1.0441, "num_tokens": 3082040.0, "step": 27 }, { "epoch": 0.2066420664206642, "grad_norm": 0.8737960258241075, "learning_rate": 6.585365853658538e-06, "loss": 1.1585, "num_tokens": 3175114.0, "step": 28 }, { "epoch": 0.2140221402214022, "grad_norm": 0.7296535111218418, "learning_rate": 6.829268292682928e-06, "loss": 1.0948, "num_tokens": 3285127.0, "step": 29 }, { "epoch": 0.22140221402214022, "grad_norm": 0.7274693928818707, "learning_rate": 7.0731707317073175e-06, "loss": 1.0835, "num_tokens": 3380009.0, "step": 30 }, { "epoch": 0.22878228782287824, "grad_norm": 0.6088521526135358, "learning_rate": 7.317073170731707e-06, "loss": 1.0209, "num_tokens": 3535375.0, "step": 31 }, { "epoch": 0.23616236162361623, "grad_norm": 0.7018449321520274, "learning_rate": 7.560975609756098e-06, "loss": 1.046, "num_tokens": 3634648.0, "step": 32 }, { "epoch": 0.24354243542435425, "grad_norm": 0.6536374431603196, "learning_rate": 7.804878048780489e-06, "loss": 1.1203, "num_tokens": 3739441.0, "step": 33 }, { "epoch": 0.25092250922509224, "grad_norm": 0.51602945023754, "learning_rate": 8.048780487804879e-06, "loss": 1.1216, "num_tokens": 3901324.0, "step": 34 }, { "epoch": 0.25830258302583026, "grad_norm": 0.6465805158894952, "learning_rate": 8.292682926829268e-06, "loss": 1.091, "num_tokens": 4028500.0, "step": 35 }, { "epoch": 0.2656826568265683, "grad_norm": 0.6651254191562199, "learning_rate": 8.536585365853658e-06, "loss": 1.0302, "num_tokens": 4122817.0, "step": 36 }, { "epoch": 0.2730627306273063, "grad_norm": 0.6521370198607039, "learning_rate": 8.78048780487805e-06, "loss": 1.04, "num_tokens": 4224938.0, "step": 37 }, { "epoch": 0.28044280442804426, "grad_norm": 0.6007174831688549, "learning_rate": 9.02439024390244e-06, "loss": 1.0265, "num_tokens": 4333124.0, "step": 38 }, { "epoch": 0.2878228782287823, "grad_norm": 0.551059225568472, "learning_rate": 9.268292682926831e-06, "loss": 1.0287, "num_tokens": 4456232.0, "step": 39 }, { "epoch": 0.2952029520295203, "grad_norm": 0.6437675813439927, "learning_rate": 9.51219512195122e-06, "loss": 1.1779, "num_tokens": 4542519.0, "step": 40 }, { "epoch": 0.3025830258302583, "grad_norm": 0.480473097605294, "learning_rate": 9.756097560975611e-06, "loss": 0.8197, "num_tokens": 4687640.0, "step": 41 }, { "epoch": 0.30996309963099633, "grad_norm": 0.5120008356671397, "learning_rate": 1e-05, "loss": 0.9641, "num_tokens": 4789703.0, "step": 42 }, { "epoch": 0.3173431734317343, "grad_norm": 0.4568876499652346, "learning_rate": 9.999987040066834e-06, "loss": 1.0052, "num_tokens": 4915970.0, "step": 43 }, { "epoch": 0.3247232472324723, "grad_norm": 0.5913746713257689, "learning_rate": 9.99994816034198e-06, "loss": 1.059, "num_tokens": 5017958.0, "step": 44 }, { "epoch": 0.33210332103321033, "grad_norm": 0.46500058691449936, "learning_rate": 9.999883361049389e-06, "loss": 1.012, "num_tokens": 5122962.0, "step": 45 }, { "epoch": 0.33948339483394835, "grad_norm": 0.5570928695789232, "learning_rate": 9.999792642562297e-06, "loss": 0.9646, "num_tokens": 5218400.0, "step": 46 }, { "epoch": 0.34686346863468637, "grad_norm": 0.5153854617107293, "learning_rate": 9.999676005403246e-06, "loss": 1.0834, "num_tokens": 5321991.0, "step": 47 }, { "epoch": 0.35424354243542433, "grad_norm": 0.49149562713241135, "learning_rate": 9.99953345024406e-06, "loss": 0.9621, "num_tokens": 5422853.0, "step": 48 }, { "epoch": 0.36162361623616235, "grad_norm": 0.5615732057290855, "learning_rate": 9.99936497790585e-06, "loss": 1.0506, "num_tokens": 5515879.0, "step": 49 }, { "epoch": 0.36900369003690037, "grad_norm": 0.571733530601376, "learning_rate": 9.999170589359015e-06, "loss": 1.0696, "num_tokens": 5623737.0, "step": 50 }, { "epoch": 0.3763837638376384, "grad_norm": 0.5032065722860658, "learning_rate": 9.998950285723228e-06, "loss": 0.9468, "num_tokens": 5740532.0, "step": 51 }, { "epoch": 0.3837638376383764, "grad_norm": 0.5659681942216157, "learning_rate": 9.998704068267427e-06, "loss": 1.1049, "num_tokens": 5847705.0, "step": 52 }, { "epoch": 0.39114391143911437, "grad_norm": 0.4782117232104864, "learning_rate": 9.998431938409822e-06, "loss": 1.0344, "num_tokens": 5978117.0, "step": 53 }, { "epoch": 0.3985239852398524, "grad_norm": 0.4647019175418653, "learning_rate": 9.998133897717868e-06, "loss": 0.9418, "num_tokens": 6095872.0, "step": 54 }, { "epoch": 0.4059040590405904, "grad_norm": 0.5143036254859215, "learning_rate": 9.997809947908275e-06, "loss": 1.0138, "num_tokens": 6197245.0, "step": 55 }, { "epoch": 0.4132841328413284, "grad_norm": 0.4808656273298818, "learning_rate": 9.997460090846982e-06, "loss": 0.9426, "num_tokens": 6302933.0, "step": 56 }, { "epoch": 0.42066420664206644, "grad_norm": 0.5754826460700532, "learning_rate": 9.997084328549156e-06, "loss": 1.086, "num_tokens": 6395435.0, "step": 57 }, { "epoch": 0.4280442804428044, "grad_norm": 0.5614265194817842, "learning_rate": 9.996682663179175e-06, "loss": 1.0444, "num_tokens": 6509555.0, "step": 58 }, { "epoch": 0.4354243542435424, "grad_norm": 0.5712826432269551, "learning_rate": 9.996255097050624e-06, "loss": 1.0086, "num_tokens": 6617826.0, "step": 59 }, { "epoch": 0.44280442804428044, "grad_norm": 0.563978826243104, "learning_rate": 9.995801632626267e-06, "loss": 1.0746, "num_tokens": 6715676.0, "step": 60 }, { "epoch": 0.45018450184501846, "grad_norm": 0.5513032046654982, "learning_rate": 9.995322272518046e-06, "loss": 1.0607, "num_tokens": 6809492.0, "step": 61 }, { "epoch": 0.4575645756457565, "grad_norm": 0.4974539118984009, "learning_rate": 9.994817019487061e-06, "loss": 0.952, "num_tokens": 6914850.0, "step": 62 }, { "epoch": 0.46494464944649444, "grad_norm": 0.438511917566343, "learning_rate": 9.994285876443558e-06, "loss": 0.9537, "num_tokens": 7039180.0, "step": 63 }, { "epoch": 0.47232472324723246, "grad_norm": 0.5378396549591032, "learning_rate": 9.993728846446903e-06, "loss": 1.0574, "num_tokens": 7128575.0, "step": 64 }, { "epoch": 0.4797047970479705, "grad_norm": 0.6082465859999698, "learning_rate": 9.993145932705569e-06, "loss": 1.0524, "num_tokens": 7217972.0, "step": 65 }, { "epoch": 0.4870848708487085, "grad_norm": 0.5103591233335351, "learning_rate": 9.992537138577125e-06, "loss": 1.0469, "num_tokens": 7354135.0, "step": 66 }, { "epoch": 0.4944649446494465, "grad_norm": 0.4621068253841004, "learning_rate": 9.991902467568208e-06, "loss": 0.9382, "num_tokens": 7471432.0, "step": 67 }, { "epoch": 0.5018450184501845, "grad_norm": 0.45732749008487794, "learning_rate": 9.991241923334503e-06, "loss": 0.9203, "num_tokens": 7590339.0, "step": 68 }, { "epoch": 0.5092250922509225, "grad_norm": 0.4662000389870386, "learning_rate": 9.990555509680723e-06, "loss": 0.9102, "num_tokens": 7711707.0, "step": 69 }, { "epoch": 0.5166051660516605, "grad_norm": 0.43875655100732047, "learning_rate": 9.989843230560593e-06, "loss": 0.8622, "num_tokens": 7826782.0, "step": 70 }, { "epoch": 0.5239852398523985, "grad_norm": 0.5489110482145291, "learning_rate": 9.98910509007682e-06, "loss": 1.0403, "num_tokens": 7976473.0, "step": 71 }, { "epoch": 0.5313653136531366, "grad_norm": 0.4574193609141772, "learning_rate": 9.988341092481069e-06, "loss": 0.9335, "num_tokens": 8115460.0, "step": 72 }, { "epoch": 0.5387453874538746, "grad_norm": 0.49070862262475273, "learning_rate": 9.987551242173945e-06, "loss": 1.294, "num_tokens": 8250050.0, "step": 73 }, { "epoch": 0.5461254612546126, "grad_norm": 0.4234076913104516, "learning_rate": 9.986735543704961e-06, "loss": 0.9397, "num_tokens": 8388396.0, "step": 74 }, { "epoch": 0.5535055350553506, "grad_norm": 0.46710664453501544, "learning_rate": 9.985894001772519e-06, "loss": 1.0268, "num_tokens": 8519930.0, "step": 75 }, { "epoch": 0.5608856088560885, "grad_norm": 0.555996347697104, "learning_rate": 9.98502662122387e-06, "loss": 1.0803, "num_tokens": 8613426.0, "step": 76 }, { "epoch": 0.5682656826568265, "grad_norm": 0.5211423653500821, "learning_rate": 9.984133407055105e-06, "loss": 1.0403, "num_tokens": 8711173.0, "step": 77 }, { "epoch": 0.5756457564575646, "grad_norm": 0.5516158654172905, "learning_rate": 9.98321436441111e-06, "loss": 1.1208, "num_tokens": 8807954.0, "step": 78 }, { "epoch": 0.5830258302583026, "grad_norm": 0.5539639014046897, "learning_rate": 9.982269498585542e-06, "loss": 1.0325, "num_tokens": 8903831.0, "step": 79 }, { "epoch": 0.5904059040590406, "grad_norm": 0.575542238443959, "learning_rate": 9.981298815020804e-06, "loss": 1.0463, "num_tokens": 9003846.0, "step": 80 }, { "epoch": 0.5977859778597786, "grad_norm": 0.45423267003556045, "learning_rate": 9.980302319307998e-06, "loss": 1.0481, "num_tokens": 9130236.0, "step": 81 }, { "epoch": 0.6051660516605166, "grad_norm": 0.463031467112172, "learning_rate": 9.979280017186915e-06, "loss": 1.0374, "num_tokens": 9256557.0, "step": 82 }, { "epoch": 0.6125461254612546, "grad_norm": 0.4416973693733997, "learning_rate": 9.978231914545981e-06, "loss": 0.9264, "num_tokens": 9373670.0, "step": 83 }, { "epoch": 0.6199261992619927, "grad_norm": 0.5238508886142513, "learning_rate": 9.977158017422241e-06, "loss": 1.0262, "num_tokens": 9491588.0, "step": 84 }, { "epoch": 0.6273062730627307, "grad_norm": 0.442540814989928, "learning_rate": 9.976058332001307e-06, "loss": 0.9437, "num_tokens": 9616906.0, "step": 85 }, { "epoch": 0.6346863468634686, "grad_norm": 0.5112364524346023, "learning_rate": 9.974932864617333e-06, "loss": 1.0335, "num_tokens": 9723859.0, "step": 86 }, { "epoch": 0.6420664206642066, "grad_norm": 0.5358861902240765, "learning_rate": 9.973781621752982e-06, "loss": 1.2516, "num_tokens": 9839708.0, "step": 87 }, { "epoch": 0.6494464944649446, "grad_norm": 0.5382936334494667, "learning_rate": 9.972604610039376e-06, "loss": 1.0241, "num_tokens": 9934614.0, "step": 88 }, { "epoch": 0.6568265682656826, "grad_norm": 0.4511647212099882, "learning_rate": 9.971401836256066e-06, "loss": 1.0173, "num_tokens": 10052192.0, "step": 89 }, { "epoch": 0.6642066420664207, "grad_norm": 0.6209524506305847, "learning_rate": 9.970173307330998e-06, "loss": 1.1109, "num_tokens": 10131429.0, "step": 90 }, { "epoch": 0.6715867158671587, "grad_norm": 0.4760295224336866, "learning_rate": 9.968919030340458e-06, "loss": 0.9765, "num_tokens": 10252935.0, "step": 91 }, { "epoch": 0.6789667896678967, "grad_norm": 0.632070574170874, "learning_rate": 9.967639012509046e-06, "loss": 1.0778, "num_tokens": 10356066.0, "step": 92 }, { "epoch": 0.6863468634686347, "grad_norm": 0.4819917337977906, "learning_rate": 9.966333261209625e-06, "loss": 0.957, "num_tokens": 10466192.0, "step": 93 }, { "epoch": 0.6937269372693727, "grad_norm": 0.5066375366228465, "learning_rate": 9.965001783963287e-06, "loss": 1.0014, "num_tokens": 10569535.0, "step": 94 }, { "epoch": 0.7011070110701108, "grad_norm": 0.450509316551867, "learning_rate": 9.963644588439297e-06, "loss": 0.9054, "num_tokens": 10688885.0, "step": 95 }, { "epoch": 0.7084870848708487, "grad_norm": 0.44224377901493983, "learning_rate": 9.962261682455065e-06, "loss": 1.35, "num_tokens": 10856147.0, "step": 96 }, { "epoch": 0.7158671586715867, "grad_norm": 0.4777650383162576, "learning_rate": 9.960853073976086e-06, "loss": 1.0102, "num_tokens": 10960833.0, "step": 97 }, { "epoch": 0.7232472324723247, "grad_norm": 0.5398453918419668, "learning_rate": 9.959418771115904e-06, "loss": 1.0557, "num_tokens": 11062207.0, "step": 98 }, { "epoch": 0.7306273062730627, "grad_norm": 0.530337299319452, "learning_rate": 9.95795878213606e-06, "loss": 0.9023, "num_tokens": 11171307.0, "step": 99 }, { "epoch": 0.7380073800738007, "grad_norm": 0.5622636104830259, "learning_rate": 9.956473115446049e-06, "loss": 1.0633, "num_tokens": 11254005.0, "step": 100 }, { "epoch": 0.7453874538745388, "grad_norm": 0.46456704334968907, "learning_rate": 9.954961779603264e-06, "loss": 0.927, "num_tokens": 11394665.0, "step": 101 }, { "epoch": 0.7527675276752768, "grad_norm": 0.5488257951604908, "learning_rate": 9.953424783312957e-06, "loss": 1.0226, "num_tokens": 11476158.0, "step": 102 }, { "epoch": 0.7601476014760148, "grad_norm": 0.4300034767733986, "learning_rate": 9.95186213542818e-06, "loss": 1.0092, "num_tokens": 11600918.0, "step": 103 }, { "epoch": 0.7675276752767528, "grad_norm": 0.4938744510670259, "learning_rate": 9.950273844949737e-06, "loss": 0.9517, "num_tokens": 11709035.0, "step": 104 }, { "epoch": 0.7749077490774908, "grad_norm": 0.5036060443814918, "learning_rate": 9.948659921026139e-06, "loss": 0.9781, "num_tokens": 11802949.0, "step": 105 }, { "epoch": 0.7822878228782287, "grad_norm": 0.5610444858635553, "learning_rate": 9.947020372953533e-06, "loss": 1.0902, "num_tokens": 11890336.0, "step": 106 }, { "epoch": 0.7896678966789668, "grad_norm": 0.5163528393079438, "learning_rate": 9.945355210175673e-06, "loss": 0.9623, "num_tokens": 11984884.0, "step": 107 }, { "epoch": 0.7970479704797048, "grad_norm": 0.5159615372931292, "learning_rate": 9.943664442283845e-06, "loss": 1.0683, "num_tokens": 12077229.0, "step": 108 }, { "epoch": 0.8044280442804428, "grad_norm": 0.5130347704349766, "learning_rate": 9.94194807901682e-06, "loss": 1.0055, "num_tokens": 12173116.0, "step": 109 }, { "epoch": 0.8118081180811808, "grad_norm": 0.4554463222781016, "learning_rate": 9.9402061302608e-06, "loss": 0.97, "num_tokens": 12280224.0, "step": 110 }, { "epoch": 0.8191881918819188, "grad_norm": 0.49800466527382514, "learning_rate": 9.938438606049362e-06, "loss": 1.0198, "num_tokens": 12377281.0, "step": 111 }, { "epoch": 0.8265682656826568, "grad_norm": 0.6048015082550192, "learning_rate": 9.936645516563387e-06, "loss": 1.0646, "num_tokens": 12453177.0, "step": 112 }, { "epoch": 0.8339483394833949, "grad_norm": 0.5217564431490526, "learning_rate": 9.934826872131024e-06, "loss": 1.1159, "num_tokens": 12547604.0, "step": 113 }, { "epoch": 0.8413284132841329, "grad_norm": 0.5421231223025501, "learning_rate": 9.932982683227606e-06, "loss": 1.1143, "num_tokens": 12637010.0, "step": 114 }, { "epoch": 0.8487084870848709, "grad_norm": 0.47714060338286013, "learning_rate": 9.931112960475606e-06, "loss": 0.941, "num_tokens": 12740899.0, "step": 115 }, { "epoch": 0.8560885608856088, "grad_norm": 0.5526084769825411, "learning_rate": 9.929217714644574e-06, "loss": 0.9838, "num_tokens": 12837300.0, "step": 116 }, { "epoch": 0.8634686346863468, "grad_norm": 0.4666579296080565, "learning_rate": 9.927296956651069e-06, "loss": 0.907, "num_tokens": 12945601.0, "step": 117 }, { "epoch": 0.8708487084870848, "grad_norm": 0.4740058144949936, "learning_rate": 9.925350697558598e-06, "loss": 0.9664, "num_tokens": 13052482.0, "step": 118 }, { "epoch": 0.8782287822878229, "grad_norm": 0.47431280302190226, "learning_rate": 9.92337894857756e-06, "loss": 1.0038, "num_tokens": 13183801.0, "step": 119 }, { "epoch": 0.8856088560885609, "grad_norm": 0.526188736768355, "learning_rate": 9.921381721065164e-06, "loss": 0.9861, "num_tokens": 13290010.0, "step": 120 }, { "epoch": 0.8929889298892989, "grad_norm": 0.5201598156189171, "learning_rate": 9.919359026525389e-06, "loss": 0.9604, "num_tokens": 13392353.0, "step": 121 }, { "epoch": 0.9003690036900369, "grad_norm": 0.48530077756820317, "learning_rate": 9.91731087660889e-06, "loss": 0.9916, "num_tokens": 13499246.0, "step": 122 }, { "epoch": 0.9077490774907749, "grad_norm": 0.515320285644199, "learning_rate": 9.91523728311295e-06, "loss": 1.0087, "num_tokens": 13584502.0, "step": 123 }, { "epoch": 0.915129151291513, "grad_norm": 0.46818818518124683, "learning_rate": 9.913138257981408e-06, "loss": 0.9929, "num_tokens": 13710703.0, "step": 124 }, { "epoch": 0.922509225092251, "grad_norm": 0.46147504674366924, "learning_rate": 9.911013813304584e-06, "loss": 0.9492, "num_tokens": 13827190.0, "step": 125 }, { "epoch": 0.9298892988929889, "grad_norm": 0.44841838892898456, "learning_rate": 9.90886396131922e-06, "loss": 0.9587, "num_tokens": 13942355.0, "step": 126 }, { "epoch": 0.9372693726937269, "grad_norm": 0.4667768028159005, "learning_rate": 9.906688714408396e-06, "loss": 0.9677, "num_tokens": 14045098.0, "step": 127 }, { "epoch": 0.9446494464944649, "grad_norm": 0.47864280022738454, "learning_rate": 9.904488085101472e-06, "loss": 0.9383, "num_tokens": 14138298.0, "step": 128 }, { "epoch": 0.9520295202952029, "grad_norm": 0.5148983443269514, "learning_rate": 9.902262086074005e-06, "loss": 1.0295, "num_tokens": 14256039.0, "step": 129 }, { "epoch": 0.959409594095941, "grad_norm": 0.5093081614806382, "learning_rate": 9.900010730147685e-06, "loss": 0.9741, "num_tokens": 14377661.0, "step": 130 }, { "epoch": 0.966789667896679, "grad_norm": 0.4647124416526912, "learning_rate": 9.897734030290254e-06, "loss": 0.9618, "num_tokens": 14497517.0, "step": 131 }, { "epoch": 0.974169741697417, "grad_norm": 0.46313506939147736, "learning_rate": 9.895431999615436e-06, "loss": 1.0342, "num_tokens": 14622979.0, "step": 132 }, { "epoch": 0.981549815498155, "grad_norm": 0.40892038773527534, "learning_rate": 9.893104651382863e-06, "loss": 0.912, "num_tokens": 14753899.0, "step": 133 }, { "epoch": 0.988929889298893, "grad_norm": 0.566168820425785, "learning_rate": 9.890751998997986e-06, "loss": 1.0993, "num_tokens": 14834866.0, "step": 134 }, { "epoch": 0.996309963099631, "grad_norm": 0.4592197488603727, "learning_rate": 9.888374056012016e-06, "loss": 0.9305, "num_tokens": 14952619.0, "step": 135 }, { "epoch": 1.0, "grad_norm": 0.4592197488603727, "learning_rate": 9.885970836121833e-06, "loss": 1.037, "num_tokens": 15018208.0, "step": 136 }, { "epoch": 1.007380073800738, "grad_norm": 0.7989877914889159, "learning_rate": 9.88354235316991e-06, "loss": 0.9263, "num_tokens": 15133428.0, "step": 137 }, { "epoch": 1.014760147601476, "grad_norm": 0.49591188266357994, "learning_rate": 9.881088621144242e-06, "loss": 0.8977, "num_tokens": 15231140.0, "step": 138 }, { "epoch": 1.022140221402214, "grad_norm": 0.4543003054132371, "learning_rate": 9.87860965417825e-06, "loss": 0.8595, "num_tokens": 15374666.0, "step": 139 }, { "epoch": 1.029520295202952, "grad_norm": 0.4585183506218285, "learning_rate": 9.876105466550708e-06, "loss": 0.9014, "num_tokens": 15481846.0, "step": 140 }, { "epoch": 1.03690036900369, "grad_norm": 0.43441901355258156, "learning_rate": 9.873576072685665e-06, "loss": 0.9625, "num_tokens": 15599370.0, "step": 141 }, { "epoch": 1.044280442804428, "grad_norm": 0.4461158736534833, "learning_rate": 9.871021487152353e-06, "loss": 0.9303, "num_tokens": 15720156.0, "step": 142 }, { "epoch": 1.051660516605166, "grad_norm": 0.7840474037444799, "learning_rate": 9.86844172466511e-06, "loss": 0.8877, "num_tokens": 15829906.0, "step": 143 }, { "epoch": 1.0590405904059041, "grad_norm": 0.4502214845749829, "learning_rate": 9.865836800083291e-06, "loss": 0.8896, "num_tokens": 15940844.0, "step": 144 }, { "epoch": 1.066420664206642, "grad_norm": 0.4981431652238191, "learning_rate": 9.863206728411184e-06, "loss": 0.82, "num_tokens": 16047950.0, "step": 145 }, { "epoch": 1.07380073800738, "grad_norm": 0.4894324276236524, "learning_rate": 9.860551524797922e-06, "loss": 0.8665, "num_tokens": 16162047.0, "step": 146 }, { "epoch": 1.081180811808118, "grad_norm": 0.4757379804176477, "learning_rate": 9.857871204537403e-06, "loss": 0.8054, "num_tokens": 16282646.0, "step": 147 }, { "epoch": 1.088560885608856, "grad_norm": 0.48374247406787124, "learning_rate": 9.855165783068188e-06, "loss": 0.8617, "num_tokens": 16386527.0, "step": 148 }, { "epoch": 1.0959409594095941, "grad_norm": 0.4336051240979257, "learning_rate": 9.852435275973427e-06, "loss": 0.7327, "num_tokens": 16496015.0, "step": 149 }, { "epoch": 1.103321033210332, "grad_norm": 0.519627688116221, "learning_rate": 9.849679698980757e-06, "loss": 0.9757, "num_tokens": 16582017.0, "step": 150 }, { "epoch": 1.1107011070110702, "grad_norm": 0.4941153232037019, "learning_rate": 9.846899067962223e-06, "loss": 0.8149, "num_tokens": 16697936.0, "step": 151 }, { "epoch": 1.118081180811808, "grad_norm": 0.44527519190153636, "learning_rate": 9.844093398934175e-06, "loss": 0.795, "num_tokens": 16830125.0, "step": 152 }, { "epoch": 1.1254612546125462, "grad_norm": 0.4603797119386462, "learning_rate": 9.841262708057183e-06, "loss": 0.8797, "num_tokens": 16927202.0, "step": 153 }, { "epoch": 1.132841328413284, "grad_norm": 0.5086903095824973, "learning_rate": 9.838407011635944e-06, "loss": 0.8436, "num_tokens": 17036892.0, "step": 154 }, { "epoch": 1.140221402214022, "grad_norm": 0.5071025376685536, "learning_rate": 9.835526326119183e-06, "loss": 0.8922, "num_tokens": 17146176.0, "step": 155 }, { "epoch": 1.1476014760147601, "grad_norm": 0.4637035144739838, "learning_rate": 9.832620668099566e-06, "loss": 0.744, "num_tokens": 17280168.0, "step": 156 }, { "epoch": 1.1549815498154983, "grad_norm": 0.5528942275682165, "learning_rate": 9.829690054313592e-06, "loss": 0.9498, "num_tokens": 17389972.0, "step": 157 }, { "epoch": 1.1623616236162362, "grad_norm": 0.531456746186789, "learning_rate": 9.826734501641512e-06, "loss": 0.863, "num_tokens": 17479848.0, "step": 158 }, { "epoch": 1.169741697416974, "grad_norm": 0.5276513234105304, "learning_rate": 9.823754027107221e-06, "loss": 0.7905, "num_tokens": 17575558.0, "step": 159 }, { "epoch": 1.1771217712177122, "grad_norm": 0.5195819159511271, "learning_rate": 9.820748647878166e-06, "loss": 0.8966, "num_tokens": 17677486.0, "step": 160 }, { "epoch": 1.1845018450184501, "grad_norm": 0.5211095057318215, "learning_rate": 9.81771838126524e-06, "loss": 0.789, "num_tokens": 17777722.0, "step": 161 }, { "epoch": 1.1918819188191883, "grad_norm": 0.46350237811105727, "learning_rate": 9.814663244722689e-06, "loss": 0.8838, "num_tokens": 17906149.0, "step": 162 }, { "epoch": 1.1992619926199262, "grad_norm": 0.44572953162057055, "learning_rate": 9.811583255848005e-06, "loss": 0.7961, "num_tokens": 18031173.0, "step": 163 }, { "epoch": 1.2066420664206643, "grad_norm": 0.4512198146226335, "learning_rate": 9.808478432381841e-06, "loss": 0.8843, "num_tokens": 18134079.0, "step": 164 }, { "epoch": 1.2140221402214022, "grad_norm": 0.482052383393916, "learning_rate": 9.805348792207883e-06, "loss": 0.8385, "num_tokens": 18242380.0, "step": 165 }, { "epoch": 1.2214022140221403, "grad_norm": 0.4637735488794427, "learning_rate": 9.802194353352765e-06, "loss": 0.8381, "num_tokens": 18343696.0, "step": 166 }, { "epoch": 1.2287822878228782, "grad_norm": 0.4184603981198051, "learning_rate": 9.79901513398596e-06, "loss": 0.8284, "num_tokens": 18448780.0, "step": 167 }, { "epoch": 1.2361623616236161, "grad_norm": 0.5906440020601853, "learning_rate": 9.79581115241968e-06, "loss": 0.8404, "num_tokens": 18543863.0, "step": 168 }, { "epoch": 1.2435424354243543, "grad_norm": 0.47936881644553336, "learning_rate": 9.792582427108762e-06, "loss": 0.947, "num_tokens": 18652628.0, "step": 169 }, { "epoch": 1.2509225092250922, "grad_norm": 0.5086167685204926, "learning_rate": 9.789328976650568e-06, "loss": 0.8962, "num_tokens": 18760319.0, "step": 170 }, { "epoch": 1.2583025830258303, "grad_norm": 0.47902043720502113, "learning_rate": 9.786050819784877e-06, "loss": 0.7773, "num_tokens": 18860975.0, "step": 171 }, { "epoch": 1.2656826568265682, "grad_norm": 0.49439604693271255, "learning_rate": 9.782747975393776e-06, "loss": 0.7651, "num_tokens": 18944704.0, "step": 172 }, { "epoch": 1.2730627306273063, "grad_norm": 0.554868265270938, "learning_rate": 9.779420462501548e-06, "loss": 0.9466, "num_tokens": 19057375.0, "step": 173 }, { "epoch": 1.2804428044280443, "grad_norm": 0.5082989408166297, "learning_rate": 9.776068300274568e-06, "loss": 0.8912, "num_tokens": 19175395.0, "step": 174 }, { "epoch": 1.2878228782287824, "grad_norm": 0.5311691180886349, "learning_rate": 9.772691508021194e-06, "loss": 0.869, "num_tokens": 19274879.0, "step": 175 }, { "epoch": 1.2952029520295203, "grad_norm": 0.5001485232951035, "learning_rate": 9.769290105191649e-06, "loss": 0.8186, "num_tokens": 19395971.0, "step": 176 }, { "epoch": 1.3025830258302582, "grad_norm": 0.4993351556405326, "learning_rate": 9.765864111377906e-06, "loss": 0.8036, "num_tokens": 19487184.0, "step": 177 }, { "epoch": 1.3099630996309963, "grad_norm": 0.5117651842979043, "learning_rate": 9.762413546313597e-06, "loss": 0.932, "num_tokens": 19577791.0, "step": 178 }, { "epoch": 1.3173431734317342, "grad_norm": 0.504664400769316, "learning_rate": 9.758938429873867e-06, "loss": 0.8499, "num_tokens": 19680531.0, "step": 179 }, { "epoch": 1.3247232472324724, "grad_norm": 0.5338570837550553, "learning_rate": 9.755438782075285e-06, "loss": 0.8251, "num_tokens": 19784918.0, "step": 180 }, { "epoch": 1.3321033210332103, "grad_norm": 0.4593700755128409, "learning_rate": 9.751914623075724e-06, "loss": 0.8421, "num_tokens": 19885160.0, "step": 181 }, { "epoch": 1.3394833948339484, "grad_norm": 0.6484547228982595, "learning_rate": 9.748365973174228e-06, "loss": 1.2993, "num_tokens": 20025884.0, "step": 182 }, { "epoch": 1.3468634686346863, "grad_norm": 0.5525718219712958, "learning_rate": 9.744792852810916e-06, "loss": 0.8541, "num_tokens": 20117897.0, "step": 183 }, { "epoch": 1.3542435424354244, "grad_norm": 0.4998012568880781, "learning_rate": 9.74119528256686e-06, "loss": 0.8986, "num_tokens": 20211673.0, "step": 184 }, { "epoch": 1.3616236162361623, "grad_norm": 0.5298929870281555, "learning_rate": 9.737573283163952e-06, "loss": 0.9566, "num_tokens": 20356074.0, "step": 185 }, { "epoch": 1.3690036900369003, "grad_norm": 0.5210495050174606, "learning_rate": 9.733926875464805e-06, "loss": 0.8431, "num_tokens": 20507950.0, "step": 186 }, { "epoch": 1.3763837638376384, "grad_norm": 0.46728864361697287, "learning_rate": 9.730256080472618e-06, "loss": 0.8116, "num_tokens": 20610309.0, "step": 187 }, { "epoch": 1.3837638376383765, "grad_norm": 0.4881675871337124, "learning_rate": 9.72656091933106e-06, "loss": 0.8415, "num_tokens": 20716085.0, "step": 188 }, { "epoch": 1.3911439114391144, "grad_norm": 0.4585691063343814, "learning_rate": 9.722841413324148e-06, "loss": 0.9216, "num_tokens": 20824644.0, "step": 189 }, { "epoch": 1.3985239852398523, "grad_norm": 0.5277282283133232, "learning_rate": 9.719097583876131e-06, "loss": 0.8181, "num_tokens": 20951577.0, "step": 190 }, { "epoch": 1.4059040590405905, "grad_norm": 0.4990143013122359, "learning_rate": 9.715329452551351e-06, "loss": 0.867, "num_tokens": 21047084.0, "step": 191 }, { "epoch": 1.4132841328413284, "grad_norm": 0.4971444658278234, "learning_rate": 9.711537041054135e-06, "loss": 0.8007, "num_tokens": 21143609.0, "step": 192 }, { "epoch": 1.4206642066420665, "grad_norm": 0.5042163135566892, "learning_rate": 9.70772037122866e-06, "loss": 0.7865, "num_tokens": 21260697.0, "step": 193 }, { "epoch": 1.4280442804428044, "grad_norm": 0.49618997802851683, "learning_rate": 9.70387946505883e-06, "loss": 0.8052, "num_tokens": 21394978.0, "step": 194 }, { "epoch": 1.4354243542435423, "grad_norm": 0.4394881712817789, "learning_rate": 9.700014344668152e-06, "loss": 0.8518, "num_tokens": 21518161.0, "step": 195 }, { "epoch": 1.4428044280442804, "grad_norm": 0.5185166994477352, "learning_rate": 9.696125032319601e-06, "loss": 0.8927, "num_tokens": 21604465.0, "step": 196 }, { "epoch": 1.4501845018450186, "grad_norm": 0.4853359471099058, "learning_rate": 9.692211550415506e-06, "loss": 0.7539, "num_tokens": 21717937.0, "step": 197 }, { "epoch": 1.4575645756457565, "grad_norm": 0.46363388748721307, "learning_rate": 9.688273921497404e-06, "loss": 0.7983, "num_tokens": 21822988.0, "step": 198 }, { "epoch": 1.4649446494464944, "grad_norm": 0.5486216548585544, "learning_rate": 9.684312168245918e-06, "loss": 0.9988, "num_tokens": 21926485.0, "step": 199 }, { "epoch": 1.4723247232472325, "grad_norm": 0.5953560651449461, "learning_rate": 9.680326313480633e-06, "loss": 0.8636, "num_tokens": 22023009.0, "step": 200 }, { "epoch": 1.4797047970479704, "grad_norm": 0.49361008034993414, "learning_rate": 9.676316380159952e-06, "loss": 0.8099, "num_tokens": 22131849.0, "step": 201 }, { "epoch": 1.4870848708487086, "grad_norm": 0.5002101213490427, "learning_rate": 9.672282391380972e-06, "loss": 0.7703, "num_tokens": 22245342.0, "step": 202 }, { "epoch": 1.4944649446494465, "grad_norm": 0.4785025045933051, "learning_rate": 9.668224370379348e-06, "loss": 0.8508, "num_tokens": 22337769.0, "step": 203 }, { "epoch": 1.5018450184501844, "grad_norm": 0.5435558274748625, "learning_rate": 9.664142340529164e-06, "loss": 0.7785, "num_tokens": 22439058.0, "step": 204 }, { "epoch": 1.5092250922509225, "grad_norm": 0.5532069442290298, "learning_rate": 9.660036325342786e-06, "loss": 0.8826, "num_tokens": 22536430.0, "step": 205 }, { "epoch": 1.5166051660516606, "grad_norm": 0.5307452858748856, "learning_rate": 9.65590634847074e-06, "loss": 0.851, "num_tokens": 22632498.0, "step": 206 }, { "epoch": 1.5239852398523985, "grad_norm": 0.4832687140091762, "learning_rate": 9.651752433701574e-06, "loss": 0.8294, "num_tokens": 22735687.0, "step": 207 }, { "epoch": 1.5313653136531364, "grad_norm": 0.5227010720446035, "learning_rate": 9.64757460496171e-06, "loss": 0.8114, "num_tokens": 22864273.0, "step": 208 }, { "epoch": 1.5387453874538746, "grad_norm": 0.49492647805506507, "learning_rate": 9.64337288631532e-06, "loss": 0.8636, "num_tokens": 22962379.0, "step": 209 }, { "epoch": 1.5461254612546127, "grad_norm": 0.4866765694699883, "learning_rate": 9.639147301964175e-06, "loss": 0.9815, "num_tokens": 23072097.0, "step": 210 }, { "epoch": 1.5535055350553506, "grad_norm": 0.5506302300377369, "learning_rate": 9.63489787624752e-06, "loss": 0.9361, "num_tokens": 23180860.0, "step": 211 }, { "epoch": 1.5608856088560885, "grad_norm": 0.4909442981205813, "learning_rate": 9.630624633641918e-06, "loss": 0.8018, "num_tokens": 23326533.0, "step": 212 }, { "epoch": 1.5682656826568264, "grad_norm": 0.403064429979273, "learning_rate": 9.62632759876112e-06, "loss": 0.7859, "num_tokens": 23446416.0, "step": 213 }, { "epoch": 1.5756457564575646, "grad_norm": 0.5048313215274081, "learning_rate": 9.622006796355918e-06, "loss": 0.7909, "num_tokens": 23543993.0, "step": 214 }, { "epoch": 1.5830258302583027, "grad_norm": 0.46500855134292074, "learning_rate": 9.61766225131401e-06, "loss": 0.8631, "num_tokens": 23678207.0, "step": 215 }, { "epoch": 1.5904059040590406, "grad_norm": 0.5254094413084759, "learning_rate": 9.61329398865984e-06, "loss": 0.8594, "num_tokens": 23810928.0, "step": 216 }, { "epoch": 1.5977859778597785, "grad_norm": 0.49497609979741825, "learning_rate": 9.608902033554476e-06, "loss": 0.8742, "num_tokens": 23917356.0, "step": 217 }, { "epoch": 1.6051660516605166, "grad_norm": 0.5417375416147627, "learning_rate": 9.604486411295446e-06, "loss": 0.8363, "num_tokens": 24035253.0, "step": 218 }, { "epoch": 1.6125461254612548, "grad_norm": 0.48750043128004844, "learning_rate": 9.600047147316605e-06, "loss": 0.7667, "num_tokens": 24146736.0, "step": 219 }, { "epoch": 1.6199261992619927, "grad_norm": 0.4505655779933478, "learning_rate": 9.595584267187981e-06, "loss": 0.7643, "num_tokens": 24253499.0, "step": 220 }, { "epoch": 1.6273062730627306, "grad_norm": 0.5037711126593194, "learning_rate": 9.59109779661563e-06, "loss": 0.8931, "num_tokens": 24356714.0, "step": 221 }, { "epoch": 1.6346863468634685, "grad_norm": 0.552666996933163, "learning_rate": 9.586587761441491e-06, "loss": 0.8319, "num_tokens": 24439630.0, "step": 222 }, { "epoch": 1.6420664206642066, "grad_norm": 0.5247298855240958, "learning_rate": 9.582054187643233e-06, "loss": 0.999, "num_tokens": 24572473.0, "step": 223 }, { "epoch": 1.6494464944649447, "grad_norm": 0.5246643217948993, "learning_rate": 9.577497101334105e-06, "loss": 0.8806, "num_tokens": 24675846.0, "step": 224 }, { "epoch": 1.6568265682656826, "grad_norm": 0.5114996684179052, "learning_rate": 9.572916528762787e-06, "loss": 0.8964, "num_tokens": 24780580.0, "step": 225 }, { "epoch": 1.6642066420664205, "grad_norm": 0.48230639883733895, "learning_rate": 9.568312496313245e-06, "loss": 0.8817, "num_tokens": 24916338.0, "step": 226 }, { "epoch": 1.6715867158671587, "grad_norm": 0.47555962389158385, "learning_rate": 9.563685030504567e-06, "loss": 0.8972, "num_tokens": 25019133.0, "step": 227 }, { "epoch": 1.6789667896678968, "grad_norm": 0.6237052756067296, "learning_rate": 9.559034157990819e-06, "loss": 0.7543, "num_tokens": 25126580.0, "step": 228 }, { "epoch": 1.6863468634686347, "grad_norm": 0.4704258168686015, "learning_rate": 9.554359905560887e-06, "loss": 0.8438, "num_tokens": 25246893.0, "step": 229 }, { "epoch": 1.6937269372693726, "grad_norm": 0.46206385798796146, "learning_rate": 9.549662300138328e-06, "loss": 0.8182, "num_tokens": 25353087.0, "step": 230 }, { "epoch": 1.7011070110701108, "grad_norm": 0.4972252613666411, "learning_rate": 9.54494136878121e-06, "loss": 0.8573, "num_tokens": 25483872.0, "step": 231 }, { "epoch": 1.7084870848708487, "grad_norm": 0.4414750299253366, "learning_rate": 9.540197138681954e-06, "loss": 0.7668, "num_tokens": 25585638.0, "step": 232 }, { "epoch": 1.7158671586715868, "grad_norm": 0.5554528041752966, "learning_rate": 9.53542963716719e-06, "loss": 1.0556, "num_tokens": 25723031.0, "step": 233 }, { "epoch": 1.7232472324723247, "grad_norm": 0.46030359376023355, "learning_rate": 9.53063889169758e-06, "loss": 0.915, "num_tokens": 25839271.0, "step": 234 }, { "epoch": 1.7306273062730626, "grad_norm": 0.5449032922287006, "learning_rate": 9.525824929867679e-06, "loss": 0.8693, "num_tokens": 25925327.0, "step": 235 }, { "epoch": 1.7380073800738007, "grad_norm": 0.4686019166095011, "learning_rate": 9.52098777940576e-06, "loss": 0.8072, "num_tokens": 26057236.0, "step": 236 }, { "epoch": 1.7453874538745389, "grad_norm": 0.4340016939308373, "learning_rate": 9.516127468173674e-06, "loss": 0.7962, "num_tokens": 26166087.0, "step": 237 }, { "epoch": 1.7527675276752768, "grad_norm": 0.5303575495741534, "learning_rate": 9.511244024166661e-06, "loss": 0.8571, "num_tokens": 26255272.0, "step": 238 }, { "epoch": 1.7601476014760147, "grad_norm": 0.5616358027384218, "learning_rate": 9.506337475513216e-06, "loss": 0.8681, "num_tokens": 26353080.0, "step": 239 }, { "epoch": 1.7675276752767528, "grad_norm": 0.5202222638637447, "learning_rate": 9.501407850474916e-06, "loss": 0.8949, "num_tokens": 26439204.0, "step": 240 }, { "epoch": 1.774907749077491, "grad_norm": 0.5609686292688814, "learning_rate": 9.496455177446252e-06, "loss": 0.8751, "num_tokens": 26560904.0, "step": 241 }, { "epoch": 1.7822878228782288, "grad_norm": 0.4613040830885771, "learning_rate": 9.491479484954475e-06, "loss": 0.869, "num_tokens": 26665635.0, "step": 242 }, { "epoch": 1.7896678966789668, "grad_norm": 0.5333172479366407, "learning_rate": 9.486480801659423e-06, "loss": 0.9239, "num_tokens": 26769223.0, "step": 243 }, { "epoch": 1.7970479704797047, "grad_norm": 0.4981884354859988, "learning_rate": 9.481459156353368e-06, "loss": 0.9425, "num_tokens": 26876616.0, "step": 244 }, { "epoch": 1.8044280442804428, "grad_norm": 0.4761851001292394, "learning_rate": 9.476414577960835e-06, "loss": 0.8853, "num_tokens": 26966877.0, "step": 245 }, { "epoch": 1.811808118081181, "grad_norm": 0.5918786962574727, "learning_rate": 9.471347095538448e-06, "loss": 0.836, "num_tokens": 27062067.0, "step": 246 }, { "epoch": 1.8191881918819188, "grad_norm": 0.47346571859338976, "learning_rate": 9.46625673827475e-06, "loss": 0.8121, "num_tokens": 27233761.0, "step": 247 }, { "epoch": 1.8265682656826567, "grad_norm": 0.43393941224602794, "learning_rate": 9.461143535490053e-06, "loss": 0.7822, "num_tokens": 27351746.0, "step": 248 }, { "epoch": 1.8339483394833949, "grad_norm": 0.4596856963327765, "learning_rate": 9.45600751663625e-06, "loss": 0.8125, "num_tokens": 27470068.0, "step": 249 }, { "epoch": 1.841328413284133, "grad_norm": 0.4623324180763746, "learning_rate": 9.45084871129666e-06, "loss": 0.8385, "num_tokens": 27587540.0, "step": 250 }, { "epoch": 1.848708487084871, "grad_norm": 0.4884311590737067, "learning_rate": 9.445667149185846e-06, "loss": 0.8584, "num_tokens": 27699030.0, "step": 251 }, { "epoch": 1.8560885608856088, "grad_norm": 0.5050542988903577, "learning_rate": 9.440462860149452e-06, "loss": 0.8695, "num_tokens": 27815973.0, "step": 252 }, { "epoch": 1.8634686346863467, "grad_norm": 0.4574188946697327, "learning_rate": 9.435235874164029e-06, "loss": 0.8468, "num_tokens": 27911876.0, "step": 253 }, { "epoch": 1.8708487084870848, "grad_norm": 0.5078878883639287, "learning_rate": 9.429986221336861e-06, "loss": 0.9666, "num_tokens": 28024905.0, "step": 254 }, { "epoch": 1.878228782287823, "grad_norm": 0.707997887987606, "learning_rate": 9.424713931905793e-06, "loss": 0.8271, "num_tokens": 28127192.0, "step": 255 }, { "epoch": 1.8856088560885609, "grad_norm": 0.5448729373134586, "learning_rate": 9.419419036239053e-06, "loss": 0.8077, "num_tokens": 28212550.0, "step": 256 }, { "epoch": 1.8929889298892988, "grad_norm": 0.46142556472216, "learning_rate": 9.414101564835086e-06, "loss": 0.7919, "num_tokens": 28352190.0, "step": 257 }, { "epoch": 1.900369003690037, "grad_norm": 0.4473495373569174, "learning_rate": 9.408761548322367e-06, "loss": 1.2457, "num_tokens": 28507538.0, "step": 258 }, { "epoch": 1.907749077490775, "grad_norm": 0.720865941907213, "learning_rate": 9.403399017459236e-06, "loss": 0.8916, "num_tokens": 28630324.0, "step": 259 }, { "epoch": 1.915129151291513, "grad_norm": 0.5405997607109805, "learning_rate": 9.398014003133704e-06, "loss": 0.8333, "num_tokens": 28720013.0, "step": 260 }, { "epoch": 1.9225092250922509, "grad_norm": 0.5243667632910098, "learning_rate": 9.392606536363304e-06, "loss": 0.8005, "num_tokens": 28850176.0, "step": 261 }, { "epoch": 1.9298892988929888, "grad_norm": 0.5120406103780775, "learning_rate": 9.387176648294874e-06, "loss": 0.8833, "num_tokens": 28945819.0, "step": 262 }, { "epoch": 1.937269372693727, "grad_norm": 0.46003493721185434, "learning_rate": 9.381724370204414e-06, "loss": 0.8105, "num_tokens": 29069004.0, "step": 263 }, { "epoch": 1.944649446494465, "grad_norm": 0.5105600061684051, "learning_rate": 9.376249733496882e-06, "loss": 0.852, "num_tokens": 29197272.0, "step": 264 }, { "epoch": 1.952029520295203, "grad_norm": 0.47786401535471007, "learning_rate": 9.370752769706024e-06, "loss": 0.8574, "num_tokens": 29291901.0, "step": 265 }, { "epoch": 1.9594095940959408, "grad_norm": 0.5006501587752776, "learning_rate": 9.365233510494186e-06, "loss": 0.8988, "num_tokens": 29407240.0, "step": 266 }, { "epoch": 1.966789667896679, "grad_norm": 0.5042822617717528, "learning_rate": 9.35969198765214e-06, "loss": 0.7268, "num_tokens": 29503115.0, "step": 267 }, { "epoch": 1.974169741697417, "grad_norm": 0.5223678106689094, "learning_rate": 9.354128233098889e-06, "loss": 0.907, "num_tokens": 29612199.0, "step": 268 }, { "epoch": 1.981549815498155, "grad_norm": 0.48980749434039345, "learning_rate": 9.348542278881497e-06, "loss": 0.9017, "num_tokens": 29711314.0, "step": 269 }, { "epoch": 1.988929889298893, "grad_norm": 0.5397711308358015, "learning_rate": 9.342934157174895e-06, "loss": 0.8368, "num_tokens": 29858282.0, "step": 270 }, { "epoch": 1.996309963099631, "grad_norm": 0.4018162467044364, "learning_rate": 9.337303900281693e-06, "loss": 0.7869, "num_tokens": 29975582.0, "step": 271 }, { "epoch": 2.0, "grad_norm": 0.7968766444431548, "learning_rate": 9.33165154063201e-06, "loss": 0.8073, "num_tokens": 30038824.0, "step": 272 }, { "epoch": 2.007380073800738, "grad_norm": 0.5215763819261514, "learning_rate": 9.325977110783264e-06, "loss": 0.7607, "num_tokens": 30161706.0, "step": 273 }, { "epoch": 2.014760147601476, "grad_norm": 0.4942048225579933, "learning_rate": 9.320280643420006e-06, "loss": 0.7386, "num_tokens": 30287957.0, "step": 274 }, { "epoch": 2.022140221402214, "grad_norm": 0.5113901907785446, "learning_rate": 9.314562171353717e-06, "loss": 0.7125, "num_tokens": 30397373.0, "step": 275 }, { "epoch": 2.029520295202952, "grad_norm": 0.5018401846949873, "learning_rate": 9.308821727522626e-06, "loss": 0.711, "num_tokens": 30493389.0, "step": 276 }, { "epoch": 2.03690036900369, "grad_norm": 0.5575170285871848, "learning_rate": 9.303059344991519e-06, "loss": 0.6267, "num_tokens": 30601222.0, "step": 277 }, { "epoch": 2.044280442804428, "grad_norm": 0.8923484894330787, "learning_rate": 9.297275056951551e-06, "loss": 0.7329, "num_tokens": 30705969.0, "step": 278 }, { "epoch": 2.0516605166051662, "grad_norm": 0.669629855434588, "learning_rate": 9.291468896720045e-06, "loss": 0.7182, "num_tokens": 30800590.0, "step": 279 }, { "epoch": 2.059040590405904, "grad_norm": 0.534127771451391, "learning_rate": 9.285640897740316e-06, "loss": 0.7151, "num_tokens": 30933354.0, "step": 280 }, { "epoch": 2.066420664206642, "grad_norm": 0.9857905397677859, "learning_rate": 9.279791093581461e-06, "loss": 0.9996, "num_tokens": 31059251.0, "step": 281 }, { "epoch": 2.07380073800738, "grad_norm": 0.4870382736213072, "learning_rate": 9.27391951793818e-06, "loss": 0.6776, "num_tokens": 31174572.0, "step": 282 }, { "epoch": 2.081180811808118, "grad_norm": 0.5506320918654295, "learning_rate": 9.268026204630574e-06, "loss": 0.6994, "num_tokens": 31274898.0, "step": 283 }, { "epoch": 2.088560885608856, "grad_norm": 0.5200515436580103, "learning_rate": 9.262111187603953e-06, "loss": 0.7309, "num_tokens": 31395801.0, "step": 284 }, { "epoch": 2.095940959409594, "grad_norm": 0.5513366405876152, "learning_rate": 9.25617450092864e-06, "loss": 0.6825, "num_tokens": 31490903.0, "step": 285 }, { "epoch": 2.103321033210332, "grad_norm": 0.5363819397451034, "learning_rate": 9.250216178799772e-06, "loss": 0.7638, "num_tokens": 31603887.0, "step": 286 }, { "epoch": 2.11070110701107, "grad_norm": 0.5423607210477918, "learning_rate": 9.244236255537108e-06, "loss": 0.699, "num_tokens": 31714116.0, "step": 287 }, { "epoch": 2.1180811808118083, "grad_norm": 0.5862396441020576, "learning_rate": 9.23823476558483e-06, "loss": 0.726, "num_tokens": 31809657.0, "step": 288 }, { "epoch": 2.125461254612546, "grad_norm": 0.6082666311294169, "learning_rate": 9.23221174351134e-06, "loss": 0.6569, "num_tokens": 31903064.0, "step": 289 }, { "epoch": 2.132841328413284, "grad_norm": 0.6670626481541114, "learning_rate": 9.226167224009065e-06, "loss": 0.735, "num_tokens": 31977936.0, "step": 290 }, { "epoch": 2.140221402214022, "grad_norm": 0.6340720630907666, "learning_rate": 9.220101241894262e-06, "loss": 0.7888, "num_tokens": 32096002.0, "step": 291 }, { "epoch": 2.14760147601476, "grad_norm": 0.5591180559329147, "learning_rate": 9.214013832106806e-06, "loss": 0.6877, "num_tokens": 32200262.0, "step": 292 }, { "epoch": 2.1549815498154983, "grad_norm": 0.5897306782384196, "learning_rate": 9.207905029709996e-06, "loss": 0.7687, "num_tokens": 32304732.0, "step": 293 }, { "epoch": 2.162361623616236, "grad_norm": 0.5283507246823136, "learning_rate": 9.201774869890351e-06, "loss": 0.664, "num_tokens": 32402049.0, "step": 294 }, { "epoch": 2.169741697416974, "grad_norm": 0.49182885764939005, "learning_rate": 9.195623387957412e-06, "loss": 0.6478, "num_tokens": 32528738.0, "step": 295 }, { "epoch": 2.177121771217712, "grad_norm": 0.5230520282166146, "learning_rate": 9.18945061934353e-06, "loss": 0.6481, "num_tokens": 32633734.0, "step": 296 }, { "epoch": 2.1845018450184504, "grad_norm": 0.505393942279067, "learning_rate": 9.183256599603672e-06, "loss": 0.7, "num_tokens": 32744593.0, "step": 297 }, { "epoch": 2.1918819188191883, "grad_norm": 0.6036841047133673, "learning_rate": 9.177041364415203e-06, "loss": 0.7305, "num_tokens": 32857772.0, "step": 298 }, { "epoch": 2.199261992619926, "grad_norm": 0.5464815698522134, "learning_rate": 9.170804949577698e-06, "loss": 0.7285, "num_tokens": 32950999.0, "step": 299 }, { "epoch": 2.206642066420664, "grad_norm": 0.586314392067132, "learning_rate": 9.16454739101272e-06, "loss": 0.6988, "num_tokens": 33034720.0, "step": 300 }, { "epoch": 2.2140221402214024, "grad_norm": 0.925180511120846, "learning_rate": 9.158268724763615e-06, "loss": 1.1771, "num_tokens": 33189582.0, "step": 301 }, { "epoch": 2.2214022140221403, "grad_norm": 0.5979161298544858, "learning_rate": 9.151968986995322e-06, "loss": 0.653, "num_tokens": 33348052.0, "step": 302 }, { "epoch": 2.2287822878228782, "grad_norm": 0.603845704491123, "learning_rate": 9.14564821399414e-06, "loss": 0.6487, "num_tokens": 33447540.0, "step": 303 }, { "epoch": 2.236162361623616, "grad_norm": 0.6191017874458337, "learning_rate": 9.139306442167533e-06, "loss": 0.7839, "num_tokens": 33534583.0, "step": 304 }, { "epoch": 2.243542435424354, "grad_norm": 0.5816464795523667, "learning_rate": 9.132943708043919e-06, "loss": 0.8154, "num_tokens": 33648628.0, "step": 305 }, { "epoch": 2.2509225092250924, "grad_norm": 0.5858083162521804, "learning_rate": 9.126560048272457e-06, "loss": 0.7596, "num_tokens": 33751895.0, "step": 306 }, { "epoch": 2.2583025830258303, "grad_norm": 0.5260445329669106, "learning_rate": 9.12015549962284e-06, "loss": 0.7344, "num_tokens": 33878759.0, "step": 307 }, { "epoch": 2.265682656826568, "grad_norm": 0.5328592013038843, "learning_rate": 9.113730098985076e-06, "loss": 0.6032, "num_tokens": 33986222.0, "step": 308 }, { "epoch": 2.273062730627306, "grad_norm": 0.5404895444418575, "learning_rate": 9.10728388336928e-06, "loss": 0.6752, "num_tokens": 34084937.0, "step": 309 }, { "epoch": 2.280442804428044, "grad_norm": 0.5165834237491866, "learning_rate": 9.100816889905465e-06, "loss": 0.6338, "num_tokens": 34203999.0, "step": 310 }, { "epoch": 2.2878228782287824, "grad_norm": 0.5079482408380165, "learning_rate": 9.094329155843323e-06, "loss": 0.6519, "num_tokens": 34327452.0, "step": 311 }, { "epoch": 2.2952029520295203, "grad_norm": 0.558757746271742, "learning_rate": 9.087820718552006e-06, "loss": 0.711, "num_tokens": 34424365.0, "step": 312 }, { "epoch": 2.302583025830258, "grad_norm": 0.56955775758271, "learning_rate": 9.081291615519921e-06, "loss": 0.6588, "num_tokens": 34519556.0, "step": 313 }, { "epoch": 2.3099630996309966, "grad_norm": 0.5571316363879791, "learning_rate": 9.074741884354507e-06, "loss": 0.7381, "num_tokens": 34624816.0, "step": 314 }, { "epoch": 2.3173431734317345, "grad_norm": 0.5525816487649007, "learning_rate": 9.068171562782022e-06, "loss": 0.6969, "num_tokens": 34734817.0, "step": 315 }, { "epoch": 2.3247232472324724, "grad_norm": 0.6353130260407771, "learning_rate": 9.061580688647322e-06, "loss": 0.7271, "num_tokens": 34857095.0, "step": 316 }, { "epoch": 2.3321033210332103, "grad_norm": 0.5044947960776338, "learning_rate": 9.054969299913646e-06, "loss": 0.6678, "num_tokens": 34995390.0, "step": 317 }, { "epoch": 2.339483394833948, "grad_norm": 0.5037735213732109, "learning_rate": 9.048337434662398e-06, "loss": 0.6462, "num_tokens": 35122926.0, "step": 318 }, { "epoch": 2.3468634686346865, "grad_norm": 0.540258518479221, "learning_rate": 9.041685131092925e-06, "loss": 0.6824, "num_tokens": 35233600.0, "step": 319 }, { "epoch": 2.3542435424354244, "grad_norm": 0.508870393111074, "learning_rate": 9.035012427522296e-06, "loss": 0.6109, "num_tokens": 35336155.0, "step": 320 }, { "epoch": 2.3616236162361623, "grad_norm": 0.5095273169618749, "learning_rate": 9.028319362385088e-06, "loss": 0.6582, "num_tokens": 35455988.0, "step": 321 }, { "epoch": 2.3690036900369003, "grad_norm": 0.5901762579618401, "learning_rate": 9.021605974233153e-06, "loss": 0.6647, "num_tokens": 35538087.0, "step": 322 }, { "epoch": 2.376383763837638, "grad_norm": 0.49466058873563995, "learning_rate": 9.014872301735412e-06, "loss": 0.6884, "num_tokens": 35705951.0, "step": 323 }, { "epoch": 2.3837638376383765, "grad_norm": 0.5962756574638172, "learning_rate": 9.008118383677618e-06, "loss": 0.7109, "num_tokens": 35807170.0, "step": 324 }, { "epoch": 2.3911439114391144, "grad_norm": 0.465779093896106, "learning_rate": 9.001344258962134e-06, "loss": 0.6042, "num_tokens": 35925122.0, "step": 325 }, { "epoch": 2.3985239852398523, "grad_norm": 0.5839163750401734, "learning_rate": 8.994549966607723e-06, "loss": 0.6991, "num_tokens": 36014729.0, "step": 326 }, { "epoch": 2.4059040590405902, "grad_norm": 0.5124430350072646, "learning_rate": 8.987735545749304e-06, "loss": 0.6827, "num_tokens": 36139300.0, "step": 327 }, { "epoch": 2.4132841328413286, "grad_norm": 0.5661550803778267, "learning_rate": 8.980901035637739e-06, "loss": 0.7175, "num_tokens": 36235122.0, "step": 328 }, { "epoch": 2.4206642066420665, "grad_norm": 0.5589617880405421, "learning_rate": 8.974046475639605e-06, "loss": 0.6639, "num_tokens": 36342683.0, "step": 329 }, { "epoch": 2.4280442804428044, "grad_norm": 0.4908227738995962, "learning_rate": 8.96717190523696e-06, "loss": 0.6748, "num_tokens": 36457622.0, "step": 330 }, { "epoch": 2.4354243542435423, "grad_norm": 0.578544996079008, "learning_rate": 8.96027736402713e-06, "loss": 0.6712, "num_tokens": 36550707.0, "step": 331 }, { "epoch": 2.4428044280442807, "grad_norm": 0.5474890587680412, "learning_rate": 8.953362891722464e-06, "loss": 0.6869, "num_tokens": 36669279.0, "step": 332 }, { "epoch": 2.4501845018450186, "grad_norm": 0.4724321116407533, "learning_rate": 8.94642852815012e-06, "loss": 0.5751, "num_tokens": 36793295.0, "step": 333 }, { "epoch": 2.4575645756457565, "grad_norm": 0.606460496382911, "learning_rate": 8.939474313251824e-06, "loss": 0.7516, "num_tokens": 36877447.0, "step": 334 }, { "epoch": 2.4649446494464944, "grad_norm": 0.6045532758033616, "learning_rate": 8.932500287083647e-06, "loss": 0.773, "num_tokens": 36982748.0, "step": 335 }, { "epoch": 2.4723247232472323, "grad_norm": 0.5065444389875737, "learning_rate": 8.925506489815773e-06, "loss": 0.636, "num_tokens": 37094162.0, "step": 336 }, { "epoch": 2.4797047970479706, "grad_norm": 0.5799022450280746, "learning_rate": 8.918492961732268e-06, "loss": 0.7163, "num_tokens": 37203237.0, "step": 337 }, { "epoch": 2.4870848708487086, "grad_norm": 0.49720680793473937, "learning_rate": 8.911459743230844e-06, "loss": 0.6905, "num_tokens": 37327724.0, "step": 338 }, { "epoch": 2.4944649446494465, "grad_norm": 0.5621911034313891, "learning_rate": 8.904406874822633e-06, "loss": 0.7011, "num_tokens": 37422462.0, "step": 339 }, { "epoch": 2.5018450184501844, "grad_norm": 0.5993381292011397, "learning_rate": 8.897334397131945e-06, "loss": 0.745, "num_tokens": 37527002.0, "step": 340 }, { "epoch": 2.5092250922509223, "grad_norm": 0.5524534340458445, "learning_rate": 8.89024235089604e-06, "loss": 0.6314, "num_tokens": 37644410.0, "step": 341 }, { "epoch": 2.5166051660516606, "grad_norm": 0.5968119241632962, "learning_rate": 8.883130776964896e-06, "loss": 0.6736, "num_tokens": 37740096.0, "step": 342 }, { "epoch": 2.5239852398523985, "grad_norm": 0.5653506959798167, "learning_rate": 8.875999716300969e-06, "loss": 0.6653, "num_tokens": 37829867.0, "step": 343 }, { "epoch": 2.5313653136531364, "grad_norm": 0.5699516231964283, "learning_rate": 8.868849209978954e-06, "loss": 0.6591, "num_tokens": 37924329.0, "step": 344 }, { "epoch": 2.538745387453875, "grad_norm": 0.5710685664535915, "learning_rate": 8.861679299185557e-06, "loss": 0.7203, "num_tokens": 38058678.0, "step": 345 }, { "epoch": 2.5461254612546127, "grad_norm": 0.4616390155799769, "learning_rate": 8.85449002521925e-06, "loss": 0.6561, "num_tokens": 38202792.0, "step": 346 }, { "epoch": 2.5535055350553506, "grad_norm": 0.5849649802018279, "learning_rate": 8.847281429490037e-06, "loss": 0.6766, "num_tokens": 38306259.0, "step": 347 }, { "epoch": 2.5608856088560885, "grad_norm": 0.5386758574010692, "learning_rate": 8.840053553519216e-06, "loss": 0.6315, "num_tokens": 38413647.0, "step": 348 }, { "epoch": 2.5682656826568264, "grad_norm": 0.49170936889455386, "learning_rate": 8.832806438939137e-06, "loss": 0.6465, "num_tokens": 38544977.0, "step": 349 }, { "epoch": 2.5756457564575648, "grad_norm": 0.5775622051020965, "learning_rate": 8.825540127492966e-06, "loss": 0.7501, "num_tokens": 38650617.0, "step": 350 }, { "epoch": 2.5830258302583027, "grad_norm": 0.4843732761430568, "learning_rate": 8.818254661034442e-06, "loss": 0.6355, "num_tokens": 38821487.0, "step": 351 }, { "epoch": 2.5904059040590406, "grad_norm": 0.5647579621437209, "learning_rate": 8.810950081527633e-06, "loss": 0.7156, "num_tokens": 38947813.0, "step": 352 }, { "epoch": 2.5977859778597785, "grad_norm": 0.5087818689310419, "learning_rate": 8.803626431046703e-06, "loss": 0.6901, "num_tokens": 39061069.0, "step": 353 }, { "epoch": 2.6051660516605164, "grad_norm": 0.5037922071125235, "learning_rate": 8.796283751775657e-06, "loss": 0.6543, "num_tokens": 39188302.0, "step": 354 }, { "epoch": 2.6125461254612548, "grad_norm": 1.0060863716469244, "learning_rate": 8.78892208600811e-06, "loss": 0.7447, "num_tokens": 39283955.0, "step": 355 }, { "epoch": 2.6199261992619927, "grad_norm": 0.5341956449839068, "learning_rate": 8.781541476147043e-06, "loss": 0.6923, "num_tokens": 39401219.0, "step": 356 }, { "epoch": 2.6273062730627306, "grad_norm": 3.805500375286104, "learning_rate": 8.774141964704547e-06, "loss": 0.5917, "num_tokens": 39537217.0, "step": 357 }, { "epoch": 2.6346863468634685, "grad_norm": 0.5309380227267634, "learning_rate": 8.766723594301585e-06, "loss": 0.7479, "num_tokens": 39655525.0, "step": 358 }, { "epoch": 2.6420664206642064, "grad_norm": 0.54834990506598, "learning_rate": 8.759286407667755e-06, "loss": 0.7124, "num_tokens": 39753518.0, "step": 359 }, { "epoch": 2.6494464944649447, "grad_norm": 0.5247565829789032, "learning_rate": 8.751830447641028e-06, "loss": 0.6959, "num_tokens": 39872001.0, "step": 360 }, { "epoch": 2.6568265682656826, "grad_norm": 0.5528126055251835, "learning_rate": 8.744355757167513e-06, "loss": 0.7219, "num_tokens": 39973085.0, "step": 361 }, { "epoch": 2.6642066420664205, "grad_norm": 0.7178953596915454, "learning_rate": 8.736862379301205e-06, "loss": 1.1287, "num_tokens": 40117640.0, "step": 362 }, { "epoch": 2.671586715867159, "grad_norm": 0.5590814924293989, "learning_rate": 8.72935035720374e-06, "loss": 0.6442, "num_tokens": 40243111.0, "step": 363 }, { "epoch": 2.678966789667897, "grad_norm": 0.5858386874871219, "learning_rate": 8.721819734144137e-06, "loss": 0.7624, "num_tokens": 40340644.0, "step": 364 }, { "epoch": 2.6863468634686347, "grad_norm": 0.5439140593105536, "learning_rate": 8.714270553498567e-06, "loss": 0.8634, "num_tokens": 40459017.0, "step": 365 }, { "epoch": 2.6937269372693726, "grad_norm": 0.5696734249674251, "learning_rate": 8.706702858750084e-06, "loss": 0.6883, "num_tokens": 40567170.0, "step": 366 }, { "epoch": 2.7011070110701105, "grad_norm": 0.540221339754704, "learning_rate": 8.699116693488383e-06, "loss": 0.6701, "num_tokens": 40689551.0, "step": 367 }, { "epoch": 2.708487084870849, "grad_norm": 0.5268135413989117, "learning_rate": 8.691512101409553e-06, "loss": 0.7441, "num_tokens": 40808209.0, "step": 368 }, { "epoch": 2.715867158671587, "grad_norm": 0.5779813810874505, "learning_rate": 8.68388912631582e-06, "loss": 0.6434, "num_tokens": 40920204.0, "step": 369 }, { "epoch": 2.7232472324723247, "grad_norm": 0.5715358388081127, "learning_rate": 8.676247812115288e-06, "loss": 1.0132, "num_tokens": 41052491.0, "step": 370 }, { "epoch": 2.7306273062730626, "grad_norm": 0.5631079744811832, "learning_rate": 8.668588202821708e-06, "loss": 0.6923, "num_tokens": 41148568.0, "step": 371 }, { "epoch": 2.7380073800738005, "grad_norm": 0.6059248747102125, "learning_rate": 8.660910342554194e-06, "loss": 0.6413, "num_tokens": 41229823.0, "step": 372 }, { "epoch": 2.745387453874539, "grad_norm": 0.5568122522506863, "learning_rate": 8.653214275537e-06, "loss": 0.7091, "num_tokens": 41350523.0, "step": 373 }, { "epoch": 2.7527675276752768, "grad_norm": 0.5156740799977944, "learning_rate": 8.645500046099237e-06, "loss": 0.6046, "num_tokens": 41461892.0, "step": 374 }, { "epoch": 2.7601476014760147, "grad_norm": 0.5328035035712879, "learning_rate": 8.637767698674642e-06, "loss": 0.7124, "num_tokens": 41599826.0, "step": 375 }, { "epoch": 2.767527675276753, "grad_norm": 0.6395151238615233, "learning_rate": 8.630017277801306e-06, "loss": 0.6799, "num_tokens": 41684056.0, "step": 376 }, { "epoch": 2.774907749077491, "grad_norm": 0.5718677138308345, "learning_rate": 8.62224882812142e-06, "loss": 0.6698, "num_tokens": 41789363.0, "step": 377 }, { "epoch": 2.782287822878229, "grad_norm": 0.5745655870409727, "learning_rate": 8.614462394381028e-06, "loss": 0.6536, "num_tokens": 41902328.0, "step": 378 }, { "epoch": 2.7896678966789668, "grad_norm": 0.5962600314891544, "learning_rate": 8.606658021429754e-06, "loss": 0.7095, "num_tokens": 42013036.0, "step": 379 }, { "epoch": 2.7970479704797047, "grad_norm": 0.589009128309455, "learning_rate": 8.598835754220554e-06, "loss": 0.7223, "num_tokens": 42142939.0, "step": 380 }, { "epoch": 2.804428044280443, "grad_norm": 0.6135753550691891, "learning_rate": 8.590995637809459e-06, "loss": 0.6928, "num_tokens": 42225463.0, "step": 381 }, { "epoch": 2.811808118081181, "grad_norm": 0.6250756573123484, "learning_rate": 8.5831377173553e-06, "loss": 0.7181, "num_tokens": 42307416.0, "step": 382 }, { "epoch": 2.819188191881919, "grad_norm": 0.5798107659221702, "learning_rate": 8.575262038119468e-06, "loss": 0.8076, "num_tokens": 42427179.0, "step": 383 }, { "epoch": 2.8265682656826567, "grad_norm": 0.531136083790205, "learning_rate": 8.567368645465646e-06, "loss": 0.6446, "num_tokens": 42530634.0, "step": 384 }, { "epoch": 2.8339483394833946, "grad_norm": 0.5527830503779945, "learning_rate": 8.559457584859537e-06, "loss": 0.7242, "num_tokens": 42637999.0, "step": 385 }, { "epoch": 2.841328413284133, "grad_norm": 0.5614225280721861, "learning_rate": 8.551528901868614e-06, "loss": 0.6411, "num_tokens": 42728403.0, "step": 386 }, { "epoch": 2.848708487084871, "grad_norm": 0.5346554017745229, "learning_rate": 8.543582642161857e-06, "loss": 0.6717, "num_tokens": 42838803.0, "step": 387 }, { "epoch": 2.856088560885609, "grad_norm": 0.6029067731047304, "learning_rate": 8.535618851509487e-06, "loss": 0.7271, "num_tokens": 42952058.0, "step": 388 }, { "epoch": 2.8634686346863467, "grad_norm": 0.5218810595386072, "learning_rate": 8.5276375757827e-06, "loss": 0.5411, "num_tokens": 43052733.0, "step": 389 }, { "epoch": 2.8708487084870846, "grad_norm": 0.6781573031807308, "learning_rate": 8.519638860953408e-06, "loss": 0.747, "num_tokens": 43127428.0, "step": 390 }, { "epoch": 2.878228782287823, "grad_norm": 0.5474215701215952, "learning_rate": 8.511622753093971e-06, "loss": 0.6845, "num_tokens": 43245146.0, "step": 391 }, { "epoch": 2.885608856088561, "grad_norm": 0.5828226303070865, "learning_rate": 8.503589298376931e-06, "loss": 0.6714, "num_tokens": 43343029.0, "step": 392 }, { "epoch": 2.892988929889299, "grad_norm": 0.5566542613181935, "learning_rate": 8.49553854307475e-06, "loss": 0.7581, "num_tokens": 43449381.0, "step": 393 }, { "epoch": 2.900369003690037, "grad_norm": 0.5143312406285538, "learning_rate": 8.48747053355954e-06, "loss": 0.7023, "num_tokens": 43605262.0, "step": 394 }, { "epoch": 2.907749077490775, "grad_norm": 0.5796296239031932, "learning_rate": 8.479385316302793e-06, "loss": 0.6038, "num_tokens": 43695066.0, "step": 395 }, { "epoch": 2.915129151291513, "grad_norm": 0.5717544123390733, "learning_rate": 8.47128293787512e-06, "loss": 0.632, "num_tokens": 43808970.0, "step": 396 }, { "epoch": 2.922509225092251, "grad_norm": 0.5870739135203897, "learning_rate": 8.463163444945986e-06, "loss": 0.7216, "num_tokens": 43933194.0, "step": 397 }, { "epoch": 2.9298892988929888, "grad_norm": 0.5409023547367442, "learning_rate": 8.455026884283424e-06, "loss": 0.65, "num_tokens": 44043681.0, "step": 398 }, { "epoch": 2.937269372693727, "grad_norm": 0.5728886078954685, "learning_rate": 8.446873302753783e-06, "loss": 0.756, "num_tokens": 44162249.0, "step": 399 }, { "epoch": 2.944649446494465, "grad_norm": 0.5573913061191518, "learning_rate": 8.43870274732145e-06, "loss": 0.7583, "num_tokens": 44275749.0, "step": 400 }, { "epoch": 2.952029520295203, "grad_norm": 0.6759949052252414, "learning_rate": 8.430515265048584e-06, "loss": 0.7366, "num_tokens": 44363141.0, "step": 401 }, { "epoch": 2.959409594095941, "grad_norm": 0.5658923076568075, "learning_rate": 8.422310903094836e-06, "loss": 0.7266, "num_tokens": 44460736.0, "step": 402 }, { "epoch": 2.9667896678966788, "grad_norm": 0.5534862140926422, "learning_rate": 8.41408970871709e-06, "loss": 0.6982, "num_tokens": 44555897.0, "step": 403 }, { "epoch": 2.974169741697417, "grad_norm": 0.5525386920550562, "learning_rate": 8.405851729269179e-06, "loss": 0.6984, "num_tokens": 44660818.0, "step": 404 }, { "epoch": 2.981549815498155, "grad_norm": 0.5467576384243312, "learning_rate": 8.39759701220162e-06, "loss": 0.7544, "num_tokens": 44775293.0, "step": 405 }, { "epoch": 2.988929889298893, "grad_norm": 0.5762236970584276, "learning_rate": 8.389325605061343e-06, "loss": 0.7656, "num_tokens": 44895620.0, "step": 406 }, { "epoch": 2.9963099630996313, "grad_norm": 0.5415184898333844, "learning_rate": 8.381037555491401e-06, "loss": 0.7289, "num_tokens": 45008883.0, "step": 407 }, { "epoch": 3.0, "grad_norm": 0.5415184898333844, "learning_rate": 8.372732911230717e-06, "loss": 0.742, "num_tokens": 45056415.0, "step": 408 }, { "epoch": 3.007380073800738, "grad_norm": 1.0671420058937142, "learning_rate": 8.364411720113794e-06, "loss": 0.6045, "num_tokens": 45189762.0, "step": 409 }, { "epoch": 3.014760147601476, "grad_norm": 0.7014142965782201, "learning_rate": 8.356074030070447e-06, "loss": 0.5984, "num_tokens": 45301171.0, "step": 410 }, { "epoch": 3.022140221402214, "grad_norm": 0.5815507535960692, "learning_rate": 8.347719889125521e-06, "loss": 0.5508, "num_tokens": 45408595.0, "step": 411 }, { "epoch": 3.029520295202952, "grad_norm": 0.6209554212604178, "learning_rate": 8.339349345398622e-06, "loss": 0.559, "num_tokens": 45534003.0, "step": 412 }, { "epoch": 3.03690036900369, "grad_norm": 0.5657155409020301, "learning_rate": 8.33096244710383e-06, "loss": 0.5474, "num_tokens": 45687169.0, "step": 413 }, { "epoch": 3.044280442804428, "grad_norm": 0.7085556124145286, "learning_rate": 8.322559242549435e-06, "loss": 0.5636, "num_tokens": 45783210.0, "step": 414 }, { "epoch": 3.0516605166051662, "grad_norm": 0.8664457929036407, "learning_rate": 8.31413978013764e-06, "loss": 0.4727, "num_tokens": 45880811.0, "step": 415 }, { "epoch": 3.059040590405904, "grad_norm": 0.7915254557523631, "learning_rate": 8.305704108364301e-06, "loss": 0.5057, "num_tokens": 46016909.0, "step": 416 }, { "epoch": 3.066420664206642, "grad_norm": 0.6904619869684991, "learning_rate": 8.297252275818639e-06, "loss": 0.5403, "num_tokens": 46154743.0, "step": 417 }, { "epoch": 3.07380073800738, "grad_norm": 0.6704294101177389, "learning_rate": 8.288784331182954e-06, "loss": 0.5402, "num_tokens": 46253315.0, "step": 418 }, { "epoch": 3.081180811808118, "grad_norm": 0.7803874824846486, "learning_rate": 8.280300323232361e-06, "loss": 1.0731, "num_tokens": 46386831.0, "step": 419 }, { "epoch": 3.088560885608856, "grad_norm": 0.6204587386033913, "learning_rate": 8.271800300834488e-06, "loss": 0.5226, "num_tokens": 46486510.0, "step": 420 }, { "epoch": 3.095940959409594, "grad_norm": 0.6805498120059815, "learning_rate": 8.263284312949215e-06, "loss": 0.5697, "num_tokens": 46567527.0, "step": 421 }, { "epoch": 3.103321033210332, "grad_norm": 0.6085858333682164, "learning_rate": 8.254752408628378e-06, "loss": 0.5771, "num_tokens": 46696494.0, "step": 422 }, { "epoch": 3.11070110701107, "grad_norm": 0.5450373938151896, "learning_rate": 8.246204637015494e-06, "loss": 0.5062, "num_tokens": 46810646.0, "step": 423 }, { "epoch": 3.1180811808118083, "grad_norm": 0.5420317314750567, "learning_rate": 8.237641047345473e-06, "loss": 0.5446, "num_tokens": 46928998.0, "step": 424 }, { "epoch": 3.125461254612546, "grad_norm": 0.7244311467151333, "learning_rate": 8.229061688944335e-06, "loss": 0.5372, "num_tokens": 47023410.0, "step": 425 }, { "epoch": 3.132841328413284, "grad_norm": 0.610314609252109, "learning_rate": 8.220466611228931e-06, "loss": 0.4839, "num_tokens": 47154761.0, "step": 426 }, { "epoch": 3.140221402214022, "grad_norm": 0.8006612716945156, "learning_rate": 8.211855863706654e-06, "loss": 0.5217, "num_tokens": 47270799.0, "step": 427 }, { "epoch": 3.14760147601476, "grad_norm": 0.592305967306938, "learning_rate": 8.203229495975154e-06, "loss": 0.4979, "num_tokens": 47451649.0, "step": 428 }, { "epoch": 3.1549815498154983, "grad_norm": 0.5474418113355926, "learning_rate": 8.194587557722053e-06, "loss": 0.5136, "num_tokens": 47551002.0, "step": 429 }, { "epoch": 3.162361623616236, "grad_norm": 0.6660539332965522, "learning_rate": 8.185930098724657e-06, "loss": 0.5439, "num_tokens": 47654318.0, "step": 430 }, { "epoch": 3.169741697416974, "grad_norm": 0.6382441671849843, "learning_rate": 8.177257168849673e-06, "loss": 0.5595, "num_tokens": 47797604.0, "step": 431 }, { "epoch": 3.177121771217712, "grad_norm": 0.5849382339843232, "learning_rate": 8.168568818052924e-06, "loss": 0.5581, "num_tokens": 47917575.0, "step": 432 }, { "epoch": 3.1845018450184504, "grad_norm": 0.5770792253720011, "learning_rate": 8.159865096379046e-06, "loss": 0.4999, "num_tokens": 47993898.0, "step": 433 }, { "epoch": 3.1918819188191883, "grad_norm": 0.6995519838702828, "learning_rate": 8.151146053961218e-06, "loss": 0.5878, "num_tokens": 48111580.0, "step": 434 }, { "epoch": 3.199261992619926, "grad_norm": 0.7163630765514822, "learning_rate": 8.142411741020872e-06, "loss": 0.4991, "num_tokens": 48200358.0, "step": 435 }, { "epoch": 3.206642066420664, "grad_norm": 0.6392968082830581, "learning_rate": 8.133662207867383e-06, "loss": 0.5478, "num_tokens": 48294625.0, "step": 436 }, { "epoch": 3.2140221402214024, "grad_norm": 0.6354099390161857, "learning_rate": 8.124897504897806e-06, "loss": 0.746, "num_tokens": 48395339.0, "step": 437 }, { "epoch": 3.2214022140221403, "grad_norm": 0.8664104426738044, "learning_rate": 8.116117682596571e-06, "loss": 0.6143, "num_tokens": 48480560.0, "step": 438 }, { "epoch": 3.2287822878228782, "grad_norm": 0.7067089339309491, "learning_rate": 8.10732279153519e-06, "loss": 0.5595, "num_tokens": 48605710.0, "step": 439 }, { "epoch": 3.236162361623616, "grad_norm": 0.562171737471794, "learning_rate": 8.098512882371977e-06, "loss": 0.5264, "num_tokens": 48734584.0, "step": 440 }, { "epoch": 3.243542435424354, "grad_norm": 0.552023532088518, "learning_rate": 8.089688005851746e-06, "loss": 0.4797, "num_tokens": 48879534.0, "step": 441 }, { "epoch": 3.2509225092250924, "grad_norm": 0.5902870119743631, "learning_rate": 8.080848212805526e-06, "loss": 0.5482, "num_tokens": 48983752.0, "step": 442 }, { "epoch": 3.2583025830258303, "grad_norm": 0.6035750241527118, "learning_rate": 8.071993554150258e-06, "loss": 0.535, "num_tokens": 49097804.0, "step": 443 }, { "epoch": 3.265682656826568, "grad_norm": 0.7048045755523692, "learning_rate": 8.063124080888514e-06, "loss": 0.513, "num_tokens": 49212376.0, "step": 444 }, { "epoch": 3.273062730627306, "grad_norm": 0.6574564335668868, "learning_rate": 8.0542398441082e-06, "loss": 0.5927, "num_tokens": 49330960.0, "step": 445 }, { "epoch": 3.280442804428044, "grad_norm": 0.663758732630871, "learning_rate": 8.045340894982254e-06, "loss": 0.5954, "num_tokens": 49429078.0, "step": 446 }, { "epoch": 3.2878228782287824, "grad_norm": 0.6512328570912242, "learning_rate": 8.036427284768357e-06, "loss": 0.5032, "num_tokens": 49541765.0, "step": 447 }, { "epoch": 3.2952029520295203, "grad_norm": 0.5798248571379592, "learning_rate": 8.027499064808642e-06, "loss": 0.4804, "num_tokens": 49653291.0, "step": 448 }, { "epoch": 3.302583025830258, "grad_norm": 0.614467290698762, "learning_rate": 8.018556286529387e-06, "loss": 0.5653, "num_tokens": 49761203.0, "step": 449 }, { "epoch": 3.3099630996309966, "grad_norm": 0.5686386180748071, "learning_rate": 8.009599001440733e-06, "loss": 0.4794, "num_tokens": 49912863.0, "step": 450 }, { "epoch": 3.3173431734317345, "grad_norm": 0.6032697718012249, "learning_rate": 8.000627261136375e-06, "loss": 0.5175, "num_tokens": 50011025.0, "step": 451 }, { "epoch": 3.3247232472324724, "grad_norm": 0.6065988231357294, "learning_rate": 7.991641117293267e-06, "loss": 0.5121, "num_tokens": 50132098.0, "step": 452 }, { "epoch": 3.3321033210332103, "grad_norm": 0.6562165232005108, "learning_rate": 7.982640621671336e-06, "loss": 0.6779, "num_tokens": 50220571.0, "step": 453 }, { "epoch": 3.339483394833948, "grad_norm": 0.6967924204298649, "learning_rate": 7.973625826113167e-06, "loss": 0.5329, "num_tokens": 50350796.0, "step": 454 }, { "epoch": 3.3468634686346865, "grad_norm": 0.5798961290769074, "learning_rate": 7.964596782543717e-06, "loss": 0.6361, "num_tokens": 50460299.0, "step": 455 }, { "epoch": 3.3542435424354244, "grad_norm": 0.5621999963338501, "learning_rate": 7.955553542970003e-06, "loss": 0.5423, "num_tokens": 50585790.0, "step": 456 }, { "epoch": 3.3616236162361623, "grad_norm": 0.635123869146959, "learning_rate": 7.94649615948082e-06, "loss": 0.5394, "num_tokens": 50682415.0, "step": 457 }, { "epoch": 3.3690036900369003, "grad_norm": 0.5943185984471594, "learning_rate": 7.93742468424643e-06, "loss": 0.4831, "num_tokens": 50782466.0, "step": 458 }, { "epoch": 3.376383763837638, "grad_norm": 0.6890091213779607, "learning_rate": 7.928339169518257e-06, "loss": 0.5852, "num_tokens": 50883210.0, "step": 459 }, { "epoch": 3.3837638376383765, "grad_norm": 0.6414354230918646, "learning_rate": 7.9192396676286e-06, "loss": 0.5412, "num_tokens": 50988218.0, "step": 460 }, { "epoch": 3.3911439114391144, "grad_norm": 0.6566503833612184, "learning_rate": 7.910126230990313e-06, "loss": 0.5438, "num_tokens": 51090015.0, "step": 461 }, { "epoch": 3.3985239852398523, "grad_norm": 0.6376977950559205, "learning_rate": 7.900998912096528e-06, "loss": 0.5139, "num_tokens": 51214982.0, "step": 462 }, { "epoch": 3.4059040590405902, "grad_norm": 0.6764222073349121, "learning_rate": 7.891857763520327e-06, "loss": 0.5585, "num_tokens": 51325827.0, "step": 463 }, { "epoch": 3.4132841328413286, "grad_norm": 0.600654311601057, "learning_rate": 7.882702837914455e-06, "loss": 0.5435, "num_tokens": 51450148.0, "step": 464 }, { "epoch": 3.4206642066420665, "grad_norm": 0.6744841627045786, "learning_rate": 7.873534188011009e-06, "loss": 0.5234, "num_tokens": 51543000.0, "step": 465 }, { "epoch": 3.4280442804428044, "grad_norm": 0.6170707900142415, "learning_rate": 7.864351866621143e-06, "loss": 0.6331, "num_tokens": 51649095.0, "step": 466 }, { "epoch": 3.4354243542435423, "grad_norm": 0.6536867528779, "learning_rate": 7.855155926634755e-06, "loss": 0.5194, "num_tokens": 51747999.0, "step": 467 }, { "epoch": 3.4428044280442807, "grad_norm": 0.6685792006676133, "learning_rate": 7.845946421020186e-06, "loss": 0.4761, "num_tokens": 51840725.0, "step": 468 }, { "epoch": 3.4501845018450186, "grad_norm": 0.583372186794409, "learning_rate": 7.836723402823913e-06, "loss": 0.5723, "num_tokens": 51938008.0, "step": 469 }, { "epoch": 3.4575645756457565, "grad_norm": 0.6457686614718543, "learning_rate": 7.82748692517025e-06, "loss": 0.5624, "num_tokens": 52027007.0, "step": 470 }, { "epoch": 3.4649446494464944, "grad_norm": 0.6192253028170305, "learning_rate": 7.818237041261032e-06, "loss": 0.5552, "num_tokens": 52148401.0, "step": 471 }, { "epoch": 3.4723247232472323, "grad_norm": 0.6157880744622222, "learning_rate": 7.808973804375318e-06, "loss": 0.5367, "num_tokens": 52251924.0, "step": 472 }, { "epoch": 3.4797047970479706, "grad_norm": 0.585562845678871, "learning_rate": 7.799697267869073e-06, "loss": 0.5716, "num_tokens": 52364563.0, "step": 473 }, { "epoch": 3.4870848708487086, "grad_norm": 0.5810620097720413, "learning_rate": 7.790407485174873e-06, "loss": 0.4792, "num_tokens": 52491358.0, "step": 474 }, { "epoch": 3.4944649446494465, "grad_norm": 0.7063772145482928, "learning_rate": 7.781104509801594e-06, "loss": 0.5479, "num_tokens": 52581166.0, "step": 475 }, { "epoch": 3.5018450184501844, "grad_norm": 0.6787021375520482, "learning_rate": 7.771788395334096e-06, "loss": 0.5712, "num_tokens": 52703377.0, "step": 476 }, { "epoch": 3.5092250922509223, "grad_norm": 0.6608323171116782, "learning_rate": 7.762459195432917e-06, "loss": 0.4981, "num_tokens": 52823591.0, "step": 477 }, { "epoch": 3.5166051660516606, "grad_norm": 0.6684994354546132, "learning_rate": 7.753116963833977e-06, "loss": 0.5228, "num_tokens": 52922716.0, "step": 478 }, { "epoch": 3.5239852398523985, "grad_norm": 0.685837014810044, "learning_rate": 7.74376175434825e-06, "loss": 0.5645, "num_tokens": 53027206.0, "step": 479 }, { "epoch": 3.5313653136531364, "grad_norm": 0.533647870419083, "learning_rate": 7.734393620861467e-06, "loss": 0.4529, "num_tokens": 53147758.0, "step": 480 }, { "epoch": 3.538745387453875, "grad_norm": 0.6233894301978303, "learning_rate": 7.725012617333796e-06, "loss": 0.5843, "num_tokens": 53260966.0, "step": 481 }, { "epoch": 3.5461254612546127, "grad_norm": 0.7704460497456811, "learning_rate": 7.71561879779954e-06, "loss": 0.533, "num_tokens": 53348564.0, "step": 482 }, { "epoch": 3.5535055350553506, "grad_norm": 0.5821959035028632, "learning_rate": 7.706212216366821e-06, "loss": 0.4733, "num_tokens": 53502326.0, "step": 483 }, { "epoch": 3.5608856088560885, "grad_norm": 0.5912639182099809, "learning_rate": 7.696792927217266e-06, "loss": 0.6074, "num_tokens": 53621069.0, "step": 484 }, { "epoch": 3.5682656826568264, "grad_norm": 0.7008824003925574, "learning_rate": 7.687360984605705e-06, "loss": 0.5761, "num_tokens": 53713397.0, "step": 485 }, { "epoch": 3.5756457564575648, "grad_norm": 0.640550494460841, "learning_rate": 7.677916442859843e-06, "loss": 0.5743, "num_tokens": 53811627.0, "step": 486 }, { "epoch": 3.5830258302583027, "grad_norm": 0.697013155404617, "learning_rate": 7.66845935637996e-06, "loss": 0.5367, "num_tokens": 53910389.0, "step": 487 }, { "epoch": 3.5904059040590406, "grad_norm": 0.6260702071794642, "learning_rate": 7.658989779638599e-06, "loss": 0.5189, "num_tokens": 54014073.0, "step": 488 }, { "epoch": 3.5977859778597785, "grad_norm": 0.5872641163913606, "learning_rate": 7.649507767180233e-06, "loss": 0.5334, "num_tokens": 54125732.0, "step": 489 }, { "epoch": 3.6051660516605164, "grad_norm": 0.5706512850075195, "learning_rate": 7.64001337362098e-06, "loss": 0.547, "num_tokens": 54245332.0, "step": 490 }, { "epoch": 3.6125461254612548, "grad_norm": 0.6762724541633397, "learning_rate": 7.630506653648257e-06, "loss": 0.466, "num_tokens": 54342768.0, "step": 491 }, { "epoch": 3.6199261992619927, "grad_norm": 0.7062756340518096, "learning_rate": 7.620987662020495e-06, "loss": 0.5768, "num_tokens": 54428990.0, "step": 492 }, { "epoch": 3.6273062730627306, "grad_norm": 0.7303709118580136, "learning_rate": 7.611456453566799e-06, "loss": 0.613, "num_tokens": 54513201.0, "step": 493 }, { "epoch": 3.6346863468634685, "grad_norm": 0.7786447083255501, "learning_rate": 7.601913083186648e-06, "loss": 0.5683, "num_tokens": 54602874.0, "step": 494 }, { "epoch": 3.6420664206642064, "grad_norm": 0.6574885010125108, "learning_rate": 7.592357605849572e-06, "loss": 0.5448, "num_tokens": 54688807.0, "step": 495 }, { "epoch": 3.6494464944649447, "grad_norm": 0.8591301030957893, "learning_rate": 7.582790076594836e-06, "loss": 0.6044, "num_tokens": 54772709.0, "step": 496 }, { "epoch": 3.6568265682656826, "grad_norm": 0.6372147818190681, "learning_rate": 7.573210550531126e-06, "loss": 0.5889, "num_tokens": 54904581.0, "step": 497 }, { "epoch": 3.6642066420664205, "grad_norm": 0.5816280072454586, "learning_rate": 7.563619082836225e-06, "loss": 0.5659, "num_tokens": 55055097.0, "step": 498 }, { "epoch": 3.671586715867159, "grad_norm": 0.7373811135417339, "learning_rate": 7.554015728756705e-06, "loss": 0.5186, "num_tokens": 55144312.0, "step": 499 }, { "epoch": 3.678966789667897, "grad_norm": 0.6664043352066058, "learning_rate": 7.544400543607599e-06, "loss": 0.568, "num_tokens": 55242571.0, "step": 500 }, { "epoch": 3.6863468634686347, "grad_norm": 0.6349135676386292, "learning_rate": 7.534773582772087e-06, "loss": 0.5322, "num_tokens": 55334546.0, "step": 501 }, { "epoch": 3.6937269372693726, "grad_norm": 0.6862750480162775, "learning_rate": 7.525134901701178e-06, "loss": 0.5333, "num_tokens": 55437983.0, "step": 502 }, { "epoch": 3.7011070110701105, "grad_norm": 0.6348808197020509, "learning_rate": 7.515484555913388e-06, "loss": 0.5853, "num_tokens": 55555490.0, "step": 503 }, { "epoch": 3.708487084870849, "grad_norm": 0.6129333693971603, "learning_rate": 7.5058226009944235e-06, "loss": 0.4703, "num_tokens": 55667192.0, "step": 504 }, { "epoch": 3.715867158671587, "grad_norm": 0.6151258674669042, "learning_rate": 7.496149092596856e-06, "loss": 0.6096, "num_tokens": 55780741.0, "step": 505 }, { "epoch": 3.7232472324723247, "grad_norm": 0.6083067464728237, "learning_rate": 7.48646408643981e-06, "loss": 0.496, "num_tokens": 55913521.0, "step": 506 }, { "epoch": 3.7306273062730626, "grad_norm": 0.6108105406176739, "learning_rate": 7.476767638308628e-06, "loss": 0.4575, "num_tokens": 56042599.0, "step": 507 }, { "epoch": 3.7380073800738005, "grad_norm": 0.6366911234823489, "learning_rate": 7.467059804054567e-06, "loss": 0.9373, "num_tokens": 56174558.0, "step": 508 }, { "epoch": 3.745387453874539, "grad_norm": 0.5652847911595245, "learning_rate": 7.457340639594463e-06, "loss": 0.4751, "num_tokens": 56297335.0, "step": 509 }, { "epoch": 3.7527675276752768, "grad_norm": 0.6336279282262137, "learning_rate": 7.447610200910417e-06, "loss": 0.5882, "num_tokens": 56417100.0, "step": 510 }, { "epoch": 3.7601476014760147, "grad_norm": 0.6565595172181139, "learning_rate": 7.437868544049464e-06, "loss": 0.5453, "num_tokens": 56513620.0, "step": 511 }, { "epoch": 3.767527675276753, "grad_norm": 0.6883459474029607, "learning_rate": 7.428115725123256e-06, "loss": 0.4716, "num_tokens": 56628754.0, "step": 512 }, { "epoch": 3.774907749077491, "grad_norm": 0.5859063630882125, "learning_rate": 7.4183518003077445e-06, "loss": 0.5763, "num_tokens": 56720718.0, "step": 513 }, { "epoch": 3.782287822878229, "grad_norm": 0.6638307638429727, "learning_rate": 7.408576825842845e-06, "loss": 0.5437, "num_tokens": 56813942.0, "step": 514 }, { "epoch": 3.7896678966789668, "grad_norm": 0.650844785515401, "learning_rate": 7.39879085803212e-06, "loss": 0.5295, "num_tokens": 56933381.0, "step": 515 }, { "epoch": 3.7970479704797047, "grad_norm": 0.6960337012516411, "learning_rate": 7.388993953242453e-06, "loss": 0.5942, "num_tokens": 57031898.0, "step": 516 }, { "epoch": 3.804428044280443, "grad_norm": 0.6013857034775665, "learning_rate": 7.379186167903726e-06, "loss": 0.8671, "num_tokens": 57159544.0, "step": 517 }, { "epoch": 3.811808118081181, "grad_norm": 0.6324109984635025, "learning_rate": 7.36936755850849e-06, "loss": 0.5234, "num_tokens": 57262915.0, "step": 518 }, { "epoch": 3.819188191881919, "grad_norm": 0.6543810076842617, "learning_rate": 7.359538181611643e-06, "loss": 0.554, "num_tokens": 57369864.0, "step": 519 }, { "epoch": 3.8265682656826567, "grad_norm": 0.6516252107608712, "learning_rate": 7.349698093830106e-06, "loss": 0.5926, "num_tokens": 57474391.0, "step": 520 }, { "epoch": 3.8339483394833946, "grad_norm": 0.6137494004803659, "learning_rate": 7.3398473518424886e-06, "loss": 0.5325, "num_tokens": 57578201.0, "step": 521 }, { "epoch": 3.841328413284133, "grad_norm": 0.6303464613399665, "learning_rate": 7.329986012388775e-06, "loss": 0.5099, "num_tokens": 57697852.0, "step": 522 }, { "epoch": 3.848708487084871, "grad_norm": 0.568654398023787, "learning_rate": 7.320114132269988e-06, "loss": 0.5581, "num_tokens": 57835634.0, "step": 523 }, { "epoch": 3.856088560885609, "grad_norm": 0.5934669338639725, "learning_rate": 7.310231768347862e-06, "loss": 0.5822, "num_tokens": 57909013.0, "step": 524 }, { "epoch": 3.8634686346863467, "grad_norm": 0.7395411253378467, "learning_rate": 7.30033897754452e-06, "loss": 0.5455, "num_tokens": 58030797.0, "step": 525 }, { "epoch": 3.8708487084870846, "grad_norm": 0.6244514128434226, "learning_rate": 7.290435816842144e-06, "loss": 0.577, "num_tokens": 58126913.0, "step": 526 }, { "epoch": 3.878228782287823, "grad_norm": 0.6728134092311296, "learning_rate": 7.280522343282647e-06, "loss": 0.516, "num_tokens": 58236344.0, "step": 527 }, { "epoch": 3.885608856088561, "grad_norm": 0.6290426751390251, "learning_rate": 7.270598613967339e-06, "loss": 0.4682, "num_tokens": 58353595.0, "step": 528 }, { "epoch": 3.892988929889299, "grad_norm": 0.6628163607802886, "learning_rate": 7.260664686056606e-06, "loss": 0.5101, "num_tokens": 58496620.0, "step": 529 }, { "epoch": 3.900369003690037, "grad_norm": 0.6254928081028875, "learning_rate": 7.250720616769581e-06, "loss": 0.5779, "num_tokens": 58582092.0, "step": 530 }, { "epoch": 3.907749077490775, "grad_norm": 0.6720141890936548, "learning_rate": 7.2407664633838035e-06, "loss": 0.4887, "num_tokens": 58695778.0, "step": 531 }, { "epoch": 3.915129151291513, "grad_norm": 0.6378774459005809, "learning_rate": 7.230802283234905e-06, "loss": 0.5153, "num_tokens": 58809203.0, "step": 532 }, { "epoch": 3.922509225092251, "grad_norm": 0.5348202191069997, "learning_rate": 7.220828133716268e-06, "loss": 0.5236, "num_tokens": 58933023.0, "step": 533 }, { "epoch": 3.9298892988929888, "grad_norm": 0.6866864951443549, "learning_rate": 7.210844072278694e-06, "loss": 0.5422, "num_tokens": 59030016.0, "step": 534 }, { "epoch": 3.937269372693727, "grad_norm": 0.6456241271796205, "learning_rate": 7.20085015643008e-06, "loss": 0.5258, "num_tokens": 59140666.0, "step": 535 }, { "epoch": 3.944649446494465, "grad_norm": 0.5954729482509249, "learning_rate": 7.190846443735088e-06, "loss": 0.546, "num_tokens": 59282134.0, "step": 536 }, { "epoch": 3.952029520295203, "grad_norm": 0.5907445946022006, "learning_rate": 7.180832991814802e-06, "loss": 0.4798, "num_tokens": 59378299.0, "step": 537 }, { "epoch": 3.959409594095941, "grad_norm": 0.6770695720932642, "learning_rate": 7.170809858346413e-06, "loss": 0.579, "num_tokens": 59504160.0, "step": 538 }, { "epoch": 3.9667896678966788, "grad_norm": 0.5845919920047277, "learning_rate": 7.160777101062866e-06, "loss": 0.496, "num_tokens": 59610918.0, "step": 539 }, { "epoch": 3.974169741697417, "grad_norm": 0.6342011369220734, "learning_rate": 7.150734777752547e-06, "loss": 0.6157, "num_tokens": 59710685.0, "step": 540 }, { "epoch": 3.981549815498155, "grad_norm": 0.7509567878858718, "learning_rate": 7.140682946258942e-06, "loss": 0.5196, "num_tokens": 59800753.0, "step": 541 }, { "epoch": 3.988929889298893, "grad_norm": 0.617408380954175, "learning_rate": 7.130621664480301e-06, "loss": 0.5888, "num_tokens": 59929772.0, "step": 542 }, { "epoch": 3.9963099630996313, "grad_norm": 0.616688560156464, "learning_rate": 7.1205509903693084e-06, "loss": 0.6072, "num_tokens": 60027785.0, "step": 543 }, { "epoch": 4.0, "grad_norm": 1.077119931873448, "learning_rate": 7.1104709819327455e-06, "loss": 0.4023, "num_tokens": 60072523.0, "step": 544 }, { "epoch": 4.007380073800738, "grad_norm": 0.8185549681071511, "learning_rate": 7.1003816972311636e-06, "loss": 0.3787, "num_tokens": 60178213.0, "step": 545 }, { "epoch": 4.014760147601476, "grad_norm": 0.6879287599827005, "learning_rate": 7.090283194378544e-06, "loss": 0.4793, "num_tokens": 60343840.0, "step": 546 }, { "epoch": 4.022140221402214, "grad_norm": 0.7246888092805986, "learning_rate": 7.0801755315419595e-06, "loss": 0.4101, "num_tokens": 60439988.0, "step": 547 }, { "epoch": 4.029520295202952, "grad_norm": 0.5455761792313781, "learning_rate": 7.070058766941251e-06, "loss": 0.3616, "num_tokens": 60565853.0, "step": 548 }, { "epoch": 4.03690036900369, "grad_norm": 0.7686756616172389, "learning_rate": 7.05993295884868e-06, "loss": 0.493, "num_tokens": 60681326.0, "step": 549 }, { "epoch": 4.044280442804428, "grad_norm": 1.042380896202571, "learning_rate": 7.049798165588603e-06, "loss": 0.4094, "num_tokens": 60788082.0, "step": 550 }, { "epoch": 4.051660516605166, "grad_norm": 1.0128125858080455, "learning_rate": 7.039654445537126e-06, "loss": 0.421, "num_tokens": 60893178.0, "step": 551 }, { "epoch": 4.059040590405904, "grad_norm": 0.8866696577371358, "learning_rate": 7.029501857121776e-06, "loss": 0.3882, "num_tokens": 61007793.0, "step": 552 }, { "epoch": 4.0664206642066425, "grad_norm": 0.7827477640476983, "learning_rate": 7.01934045882116e-06, "loss": 0.3865, "num_tokens": 61148858.0, "step": 553 }, { "epoch": 4.07380073800738, "grad_norm": 0.7403004236589705, "learning_rate": 7.009170309164631e-06, "loss": 0.4022, "num_tokens": 61256572.0, "step": 554 }, { "epoch": 4.081180811808118, "grad_norm": 0.773274412714197, "learning_rate": 6.9989914667319495e-06, "loss": 0.4321, "num_tokens": 61348777.0, "step": 555 }, { "epoch": 4.088560885608856, "grad_norm": 0.6936826874938206, "learning_rate": 6.988803990152944e-06, "loss": 0.3726, "num_tokens": 61463473.0, "step": 556 }, { "epoch": 4.095940959409594, "grad_norm": 0.6364735122936085, "learning_rate": 6.978607938107177e-06, "loss": 0.3764, "num_tokens": 61608501.0, "step": 557 }, { "epoch": 4.1033210332103325, "grad_norm": 0.6736554158825293, "learning_rate": 6.968403369323607e-06, "loss": 0.3921, "num_tokens": 61751791.0, "step": 558 }, { "epoch": 4.11070110701107, "grad_norm": 0.6192184760350499, "learning_rate": 6.958190342580248e-06, "loss": 0.3915, "num_tokens": 61887652.0, "step": 559 }, { "epoch": 4.118081180811808, "grad_norm": 0.6154621603693199, "learning_rate": 6.9479689167038265e-06, "loss": 0.3364, "num_tokens": 62008678.0, "step": 560 }, { "epoch": 4.125461254612546, "grad_norm": 0.7049771082005446, "learning_rate": 6.937739150569455e-06, "loss": 0.4111, "num_tokens": 62158409.0, "step": 561 }, { "epoch": 4.132841328413284, "grad_norm": 0.7181167850144017, "learning_rate": 6.927501103100284e-06, "loss": 0.3484, "num_tokens": 62271321.0, "step": 562 }, { "epoch": 4.1402214022140225, "grad_norm": 0.794654931511997, "learning_rate": 6.91725483326716e-06, "loss": 0.3911, "num_tokens": 62356715.0, "step": 563 }, { "epoch": 4.14760147601476, "grad_norm": 0.7907185130050475, "learning_rate": 6.907000400088293e-06, "loss": 0.3621, "num_tokens": 62445912.0, "step": 564 }, { "epoch": 4.154981549815498, "grad_norm": 0.7203231878606359, "learning_rate": 6.896737862628914e-06, "loss": 0.3761, "num_tokens": 62552929.0, "step": 565 }, { "epoch": 4.162361623616236, "grad_norm": 0.7850729901068995, "learning_rate": 6.886467280000935e-06, "loss": 0.4343, "num_tokens": 62695274.0, "step": 566 }, { "epoch": 4.169741697416974, "grad_norm": 0.7268275265837512, "learning_rate": 6.876188711362604e-06, "loss": 0.3882, "num_tokens": 62820912.0, "step": 567 }, { "epoch": 4.177121771217712, "grad_norm": 0.687720731041873, "learning_rate": 6.865902215918175e-06, "loss": 0.4819, "num_tokens": 62936836.0, "step": 568 }, { "epoch": 4.18450184501845, "grad_norm": 0.6633965124703752, "learning_rate": 6.855607852917555e-06, "loss": 0.3969, "num_tokens": 63069777.0, "step": 569 }, { "epoch": 4.191881918819188, "grad_norm": 0.7588942643609261, "learning_rate": 6.845305681655967e-06, "loss": 0.403, "num_tokens": 63159681.0, "step": 570 }, { "epoch": 4.199261992619927, "grad_norm": 0.6347494522478722, "learning_rate": 6.834995761473614e-06, "loss": 0.4054, "num_tokens": 63279217.0, "step": 571 }, { "epoch": 4.206642066420664, "grad_norm": 0.6614862018835697, "learning_rate": 6.824678151755328e-06, "loss": 0.4541, "num_tokens": 63417206.0, "step": 572 }, { "epoch": 4.214022140221402, "grad_norm": 0.6903189153624362, "learning_rate": 6.814352911930236e-06, "loss": 0.4138, "num_tokens": 63537780.0, "step": 573 }, { "epoch": 4.22140221402214, "grad_norm": 0.6693782968512514, "learning_rate": 6.8040201014714115e-06, "loss": 0.3367, "num_tokens": 63638489.0, "step": 574 }, { "epoch": 4.228782287822878, "grad_norm": 0.752241187045228, "learning_rate": 6.793679779895538e-06, "loss": 0.4096, "num_tokens": 63741860.0, "step": 575 }, { "epoch": 4.236162361623617, "grad_norm": 0.7282925415266427, "learning_rate": 6.783332006762556e-06, "loss": 0.9501, "num_tokens": 63870304.0, "step": 576 }, { "epoch": 4.243542435424354, "grad_norm": 0.8245230569269095, "learning_rate": 6.772976841675337e-06, "loss": 0.4047, "num_tokens": 63958107.0, "step": 577 }, { "epoch": 4.250922509225092, "grad_norm": 0.9844307444354672, "learning_rate": 6.76261434427932e-06, "loss": 0.4605, "num_tokens": 64070823.0, "step": 578 }, { "epoch": 4.25830258302583, "grad_norm": 0.7565155101273008, "learning_rate": 6.752244574262186e-06, "loss": 0.4045, "num_tokens": 64164408.0, "step": 579 }, { "epoch": 4.265682656826568, "grad_norm": 0.7759330833778824, "learning_rate": 6.741867591353498e-06, "loss": 0.4147, "num_tokens": 64273354.0, "step": 580 }, { "epoch": 4.273062730627307, "grad_norm": 0.6130687740196629, "learning_rate": 6.731483455324374e-06, "loss": 0.3457, "num_tokens": 64391822.0, "step": 581 }, { "epoch": 4.280442804428044, "grad_norm": 0.6339762466431851, "learning_rate": 6.7210922259871245e-06, "loss": 0.3936, "num_tokens": 64514968.0, "step": 582 }, { "epoch": 4.287822878228782, "grad_norm": 0.6391658581598267, "learning_rate": 6.710693963194925e-06, "loss": 0.3848, "num_tokens": 64631212.0, "step": 583 }, { "epoch": 4.29520295202952, "grad_norm": 0.7358101426442469, "learning_rate": 6.7002887268414595e-06, "loss": 0.3717, "num_tokens": 64730478.0, "step": 584 }, { "epoch": 4.302583025830258, "grad_norm": 0.8027379283621318, "learning_rate": 6.68987657686058e-06, "loss": 0.4161, "num_tokens": 64830731.0, "step": 585 }, { "epoch": 4.3099630996309966, "grad_norm": 0.6477689552553441, "learning_rate": 6.679457573225961e-06, "loss": 0.3685, "num_tokens": 64942597.0, "step": 586 }, { "epoch": 4.317343173431734, "grad_norm": 0.7229661768785778, "learning_rate": 6.669031775950754e-06, "loss": 0.3956, "num_tokens": 65049820.0, "step": 587 }, { "epoch": 4.324723247232472, "grad_norm": 0.7471828751844891, "learning_rate": 6.658599245087242e-06, "loss": 0.3206, "num_tokens": 65133098.0, "step": 588 }, { "epoch": 4.332103321033211, "grad_norm": 0.7548872209570942, "learning_rate": 6.6481600407264926e-06, "loss": 0.4157, "num_tokens": 65240629.0, "step": 589 }, { "epoch": 4.339483394833948, "grad_norm": 0.8223112816350289, "learning_rate": 6.637714222998013e-06, "loss": 0.4239, "num_tokens": 65325662.0, "step": 590 }, { "epoch": 4.3468634686346865, "grad_norm": 0.6679953036237622, "learning_rate": 6.627261852069402e-06, "loss": 0.3428, "num_tokens": 65428326.0, "step": 591 }, { "epoch": 4.354243542435424, "grad_norm": 0.7583322535402786, "learning_rate": 6.616802988146008e-06, "loss": 0.3832, "num_tokens": 65538706.0, "step": 592 }, { "epoch": 4.361623616236162, "grad_norm": 0.673827305891719, "learning_rate": 6.606337691470575e-06, "loss": 0.398, "num_tokens": 65684900.0, "step": 593 }, { "epoch": 4.369003690036901, "grad_norm": 0.6675827806343153, "learning_rate": 6.595866022322901e-06, "loss": 0.3812, "num_tokens": 65781453.0, "step": 594 }, { "epoch": 4.376383763837638, "grad_norm": 0.6942316723004445, "learning_rate": 6.585388041019488e-06, "loss": 0.4026, "num_tokens": 65892895.0, "step": 595 }, { "epoch": 4.3837638376383765, "grad_norm": 0.6709103394389789, "learning_rate": 6.574903807913201e-06, "loss": 0.3742, "num_tokens": 66008234.0, "step": 596 }, { "epoch": 4.391143911439114, "grad_norm": 0.6905786327427121, "learning_rate": 6.5644133833929065e-06, "loss": 0.3655, "num_tokens": 66109214.0, "step": 597 }, { "epoch": 4.398523985239852, "grad_norm": 0.7557820237247774, "learning_rate": 6.553916827883142e-06, "loss": 0.47, "num_tokens": 66217645.0, "step": 598 }, { "epoch": 4.405904059040591, "grad_norm": 0.7156665158820491, "learning_rate": 6.543414201843753e-06, "loss": 0.4587, "num_tokens": 66351675.0, "step": 599 }, { "epoch": 4.413284132841328, "grad_norm": 0.7314287717700599, "learning_rate": 6.532905565769556e-06, "loss": 0.3386, "num_tokens": 66442708.0, "step": 600 }, { "epoch": 4.4206642066420665, "grad_norm": 0.7399482793307474, "learning_rate": 6.52239098018998e-06, "loss": 0.4569, "num_tokens": 66560923.0, "step": 601 }, { "epoch": 4.428044280442805, "grad_norm": 0.634745317658342, "learning_rate": 6.511870505668726e-06, "loss": 0.3599, "num_tokens": 66683186.0, "step": 602 }, { "epoch": 4.435424354243542, "grad_norm": 0.7047160432707139, "learning_rate": 6.501344202803415e-06, "loss": 0.362, "num_tokens": 66798415.0, "step": 603 }, { "epoch": 4.442804428044281, "grad_norm": 0.6775990917406843, "learning_rate": 6.490812132225241e-06, "loss": 0.4059, "num_tokens": 66934969.0, "step": 604 }, { "epoch": 4.450184501845018, "grad_norm": 0.8094929699161325, "learning_rate": 6.480274354598615e-06, "loss": 0.4689, "num_tokens": 67027774.0, "step": 605 }, { "epoch": 4.4575645756457565, "grad_norm": 0.8226454567493683, "learning_rate": 6.469730930620824e-06, "loss": 0.3639, "num_tokens": 67119023.0, "step": 606 }, { "epoch": 4.464944649446495, "grad_norm": 0.6924009621937695, "learning_rate": 6.459181921021676e-06, "loss": 0.3743, "num_tokens": 67217382.0, "step": 607 }, { "epoch": 4.472324723247232, "grad_norm": 0.7349983289292954, "learning_rate": 6.448627386563155e-06, "loss": 0.4336, "num_tokens": 67318097.0, "step": 608 }, { "epoch": 4.479704797047971, "grad_norm": 0.6794249119652119, "learning_rate": 6.438067388039065e-06, "loss": 0.3629, "num_tokens": 67434604.0, "step": 609 }, { "epoch": 4.487084870848708, "grad_norm": 0.8646818271476965, "learning_rate": 6.427501986274684e-06, "loss": 0.4263, "num_tokens": 67512963.0, "step": 610 }, { "epoch": 4.4944649446494465, "grad_norm": 0.7568121700794729, "learning_rate": 6.41693124212641e-06, "loss": 0.3675, "num_tokens": 67614456.0, "step": 611 }, { "epoch": 4.501845018450185, "grad_norm": 0.7672043222057289, "learning_rate": 6.40635521648142e-06, "loss": 0.3904, "num_tokens": 67699709.0, "step": 612 }, { "epoch": 4.509225092250922, "grad_norm": 0.7511345874881337, "learning_rate": 6.395773970257303e-06, "loss": 0.3599, "num_tokens": 67804127.0, "step": 613 }, { "epoch": 4.516605166051661, "grad_norm": 0.6867131065685464, "learning_rate": 6.385187564401727e-06, "loss": 0.353, "num_tokens": 67903764.0, "step": 614 }, { "epoch": 4.523985239852399, "grad_norm": 0.6710396335014553, "learning_rate": 6.374596059892073e-06, "loss": 0.6348, "num_tokens": 68034031.0, "step": 615 }, { "epoch": 4.531365313653136, "grad_norm": 0.8625075936358634, "learning_rate": 6.363999517735091e-06, "loss": 0.722, "num_tokens": 68148463.0, "step": 616 }, { "epoch": 4.538745387453875, "grad_norm": 0.6800388785943237, "learning_rate": 6.353397998966551e-06, "loss": 0.4106, "num_tokens": 68267729.0, "step": 617 }, { "epoch": 4.546125461254612, "grad_norm": 0.7271259266704767, "learning_rate": 6.342791564650886e-06, "loss": 0.4337, "num_tokens": 68382722.0, "step": 618 }, { "epoch": 4.553505535055351, "grad_norm": 0.7502362722709528, "learning_rate": 6.332180275880843e-06, "loss": 0.4248, "num_tokens": 68475804.0, "step": 619 }, { "epoch": 4.560885608856088, "grad_norm": 0.7333546816412613, "learning_rate": 6.321564193777129e-06, "loss": 0.3865, "num_tokens": 68570574.0, "step": 620 }, { "epoch": 4.568265682656826, "grad_norm": 0.8558187774009335, "learning_rate": 6.310943379488061e-06, "loss": 0.4161, "num_tokens": 68668870.0, "step": 621 }, { "epoch": 4.575645756457565, "grad_norm": 0.7552469581133401, "learning_rate": 6.3003178941892165e-06, "loss": 0.3005, "num_tokens": 68760744.0, "step": 622 }, { "epoch": 4.583025830258302, "grad_norm": 0.5928004896884772, "learning_rate": 6.289687799083073e-06, "loss": 0.3343, "num_tokens": 68879055.0, "step": 623 }, { "epoch": 4.590405904059041, "grad_norm": 0.7523382356997574, "learning_rate": 6.279053155398663e-06, "loss": 0.4567, "num_tokens": 68972393.0, "step": 624 }, { "epoch": 4.597785977859779, "grad_norm": 0.7371578316979396, "learning_rate": 6.268414024391218e-06, "loss": 0.4588, "num_tokens": 69073371.0, "step": 625 }, { "epoch": 4.605166051660516, "grad_norm": 0.8698095075929976, "learning_rate": 6.2577704673418195e-06, "loss": 0.4769, "num_tokens": 69147682.0, "step": 626 }, { "epoch": 4.612546125461255, "grad_norm": 0.6516647547945007, "learning_rate": 6.247122545557036e-06, "loss": 0.385, "num_tokens": 69271135.0, "step": 627 }, { "epoch": 4.619926199261993, "grad_norm": 0.7705970000583697, "learning_rate": 6.236470320368582e-06, "loss": 0.3982, "num_tokens": 69379315.0, "step": 628 }, { "epoch": 4.627306273062731, "grad_norm": 0.8054329722423857, "learning_rate": 6.2258138531329595e-06, "loss": 0.4099, "num_tokens": 69471071.0, "step": 629 }, { "epoch": 4.634686346863469, "grad_norm": 0.8582480575625172, "learning_rate": 6.2151532052311e-06, "loss": 0.4169, "num_tokens": 69546670.0, "step": 630 }, { "epoch": 4.642066420664206, "grad_norm": 0.8348601482706759, "learning_rate": 6.204488438068021e-06, "loss": 0.448, "num_tokens": 69642136.0, "step": 631 }, { "epoch": 4.649446494464945, "grad_norm": 0.6430948321517582, "learning_rate": 6.193819613072467e-06, "loss": 0.4438, "num_tokens": 69832108.0, "step": 632 }, { "epoch": 4.656826568265682, "grad_norm": 0.6425387764210215, "learning_rate": 6.183146791696549e-06, "loss": 0.3639, "num_tokens": 69942356.0, "step": 633 }, { "epoch": 4.6642066420664205, "grad_norm": 0.6934609339884044, "learning_rate": 6.172470035415403e-06, "loss": 0.3851, "num_tokens": 70061692.0, "step": 634 }, { "epoch": 4.671586715867159, "grad_norm": 0.716829048974258, "learning_rate": 6.1617894057268276e-06, "loss": 0.3516, "num_tokens": 70168745.0, "step": 635 }, { "epoch": 4.678966789667896, "grad_norm": 0.7743408601424967, "learning_rate": 6.151104964150932e-06, "loss": 0.4137, "num_tokens": 70262643.0, "step": 636 }, { "epoch": 4.686346863468635, "grad_norm": 0.6771485702327544, "learning_rate": 6.140416772229785e-06, "loss": 0.4256, "num_tokens": 70383350.0, "step": 637 }, { "epoch": 4.693726937269373, "grad_norm": 0.6886339783733789, "learning_rate": 6.129724891527049e-06, "loss": 0.4286, "num_tokens": 70531946.0, "step": 638 }, { "epoch": 4.7011070110701105, "grad_norm": 0.7102941718952961, "learning_rate": 6.119029383627645e-06, "loss": 0.3719, "num_tokens": 70643122.0, "step": 639 }, { "epoch": 4.708487084870849, "grad_norm": 0.6907839812903495, "learning_rate": 6.108330310137379e-06, "loss": 0.3986, "num_tokens": 70741793.0, "step": 640 }, { "epoch": 4.715867158671586, "grad_norm": 0.6689514709397577, "learning_rate": 6.097627732682596e-06, "loss": 0.4467, "num_tokens": 70864577.0, "step": 641 }, { "epoch": 4.723247232472325, "grad_norm": 0.6774349195540061, "learning_rate": 6.086921712909824e-06, "loss": 0.3608, "num_tokens": 70983590.0, "step": 642 }, { "epoch": 4.730627306273063, "grad_norm": 0.644666936201413, "learning_rate": 6.076212312485419e-06, "loss": 0.4958, "num_tokens": 71099310.0, "step": 643 }, { "epoch": 4.7380073800738005, "grad_norm": 0.6965330619998724, "learning_rate": 6.0654995930952085e-06, "loss": 0.4535, "num_tokens": 71227732.0, "step": 644 }, { "epoch": 4.745387453874539, "grad_norm": 0.7391158181982388, "learning_rate": 6.054783616444141e-06, "loss": 0.4149, "num_tokens": 71344542.0, "step": 645 }, { "epoch": 4.752767527675276, "grad_norm": 0.7092144938171325, "learning_rate": 6.044064444255921e-06, "loss": 0.3611, "num_tokens": 71450018.0, "step": 646 }, { "epoch": 4.760147601476015, "grad_norm": 0.7822866247408168, "learning_rate": 6.033342138272663e-06, "loss": 0.3855, "num_tokens": 71532016.0, "step": 647 }, { "epoch": 4.767527675276753, "grad_norm": 0.6949716125906905, "learning_rate": 6.0226167602545296e-06, "loss": 0.4147, "num_tokens": 71644441.0, "step": 648 }, { "epoch": 4.7749077490774905, "grad_norm": 0.7030059314502649, "learning_rate": 6.01188837197938e-06, "loss": 0.3824, "num_tokens": 71745511.0, "step": 649 }, { "epoch": 4.782287822878229, "grad_norm": 0.6864412003773923, "learning_rate": 6.001157035242415e-06, "loss": 0.3907, "num_tokens": 71858712.0, "step": 650 }, { "epoch": 4.789667896678967, "grad_norm": 0.7049625591277209, "learning_rate": 5.9904228118558126e-06, "loss": 0.3608, "num_tokens": 71947709.0, "step": 651 }, { "epoch": 4.797047970479705, "grad_norm": 0.8213761554544584, "learning_rate": 5.979685763648381e-06, "loss": 0.4346, "num_tokens": 72031618.0, "step": 652 }, { "epoch": 4.804428044280443, "grad_norm": 0.6808314414773919, "learning_rate": 5.968945952465199e-06, "loss": 0.3338, "num_tokens": 72166482.0, "step": 653 }, { "epoch": 4.8118081180811805, "grad_norm": 0.6746618582832787, "learning_rate": 5.958203440167261e-06, "loss": 0.4232, "num_tokens": 72291372.0, "step": 654 }, { "epoch": 4.819188191881919, "grad_norm": 0.8480120297387668, "learning_rate": 5.947458288631117e-06, "loss": 0.4583, "num_tokens": 72399830.0, "step": 655 }, { "epoch": 4.826568265682657, "grad_norm": 0.7251109084368389, "learning_rate": 5.936710559748521e-06, "loss": 0.4565, "num_tokens": 72503354.0, "step": 656 }, { "epoch": 4.833948339483395, "grad_norm": 0.7006893405079634, "learning_rate": 5.925960315426072e-06, "loss": 0.4161, "num_tokens": 72624460.0, "step": 657 }, { "epoch": 4.841328413284133, "grad_norm": 0.6880629191287833, "learning_rate": 5.915207617584859e-06, "loss": 0.3444, "num_tokens": 72741862.0, "step": 658 }, { "epoch": 4.8487084870848705, "grad_norm": 0.6383251279558573, "learning_rate": 5.904452528160104e-06, "loss": 0.4148, "num_tokens": 72883296.0, "step": 659 }, { "epoch": 4.856088560885609, "grad_norm": 0.7516182787280705, "learning_rate": 5.893695109100798e-06, "loss": 0.3921, "num_tokens": 72994318.0, "step": 660 }, { "epoch": 4.863468634686347, "grad_norm": 0.7662024035456283, "learning_rate": 5.882935422369359e-06, "loss": 0.4267, "num_tokens": 73087117.0, "step": 661 }, { "epoch": 4.870848708487085, "grad_norm": 0.8083001695801738, "learning_rate": 5.872173529941261e-06, "loss": 0.4142, "num_tokens": 73192480.0, "step": 662 }, { "epoch": 4.878228782287823, "grad_norm": 0.699125650675328, "learning_rate": 5.861409493804686e-06, "loss": 0.4136, "num_tokens": 73297176.0, "step": 663 }, { "epoch": 4.885608856088561, "grad_norm": 0.6535158624821691, "learning_rate": 5.850643375960161e-06, "loss": 0.387, "num_tokens": 73430941.0, "step": 664 }, { "epoch": 4.892988929889299, "grad_norm": 0.6861981476949671, "learning_rate": 5.839875238420206e-06, "loss": 0.4156, "num_tokens": 73555649.0, "step": 665 }, { "epoch": 4.900369003690037, "grad_norm": 0.7059838206641451, "learning_rate": 5.829105143208973e-06, "loss": 0.3595, "num_tokens": 73649936.0, "step": 666 }, { "epoch": 4.907749077490775, "grad_norm": 0.7876415348592959, "learning_rate": 5.818333152361891e-06, "loss": 0.4678, "num_tokens": 73769910.0, "step": 667 }, { "epoch": 4.915129151291513, "grad_norm": 0.7843312715366683, "learning_rate": 5.807559327925307e-06, "loss": 0.3994, "num_tokens": 73870043.0, "step": 668 }, { "epoch": 4.922509225092251, "grad_norm": 0.8266922375686809, "learning_rate": 5.79678373195613e-06, "loss": 0.4432, "num_tokens": 73968723.0, "step": 669 }, { "epoch": 4.929889298892989, "grad_norm": 0.7553323604194044, "learning_rate": 5.786006426521473e-06, "loss": 0.3661, "num_tokens": 74074339.0, "step": 670 }, { "epoch": 4.937269372693727, "grad_norm": 0.8281643428151172, "learning_rate": 5.775227473698294e-06, "loss": 0.4633, "num_tokens": 74184933.0, "step": 671 }, { "epoch": 4.944649446494465, "grad_norm": 0.8028164536285961, "learning_rate": 5.7644469355730414e-06, "loss": 0.4214, "num_tokens": 74274840.0, "step": 672 }, { "epoch": 4.952029520295203, "grad_norm": 0.8092839757824163, "learning_rate": 5.753664874241295e-06, "loss": 0.4089, "num_tokens": 74361517.0, "step": 673 }, { "epoch": 4.959409594095941, "grad_norm": 0.774685434447823, "learning_rate": 5.7428813518074065e-06, "loss": 0.3799, "num_tokens": 74446520.0, "step": 674 }, { "epoch": 4.966789667896679, "grad_norm": 0.643089944875341, "learning_rate": 5.732096430384148e-06, "loss": 0.8404, "num_tokens": 74627275.0, "step": 675 }, { "epoch": 4.974169741697417, "grad_norm": 0.7416708152229656, "learning_rate": 5.7213101720923425e-06, "loss": 0.3653, "num_tokens": 74740803.0, "step": 676 }, { "epoch": 4.9815498154981555, "grad_norm": 0.7906004267986075, "learning_rate": 5.710522639060521e-06, "loss": 0.4038, "num_tokens": 74831156.0, "step": 677 }, { "epoch": 4.988929889298893, "grad_norm": 0.8357493566883484, "learning_rate": 5.6997338934245505e-06, "loss": 0.4747, "num_tokens": 74945585.0, "step": 678 }, { "epoch": 4.996309963099631, "grad_norm": 0.738553121534108, "learning_rate": 5.6889439973272886e-06, "loss": 0.371, "num_tokens": 75043311.0, "step": 679 }, { "epoch": 5.0, "grad_norm": 0.738553121534108, "learning_rate": 5.678153012918214e-06, "loss": 0.3903, "num_tokens": 75087882.0, "step": 680 }, { "epoch": 5.007380073800738, "grad_norm": 1.231554549265975, "learning_rate": 5.667361002353077e-06, "loss": 0.3047, "num_tokens": 75171142.0, "step": 681 }, { "epoch": 5.014760147601476, "grad_norm": 0.8071987884892139, "learning_rate": 5.6565680277935355e-06, "loss": 0.3119, "num_tokens": 75284705.0, "step": 682 }, { "epoch": 5.022140221402214, "grad_norm": 0.7519543262012047, "learning_rate": 5.6457741514068055e-06, "loss": 0.3127, "num_tokens": 75398261.0, "step": 683 }, { "epoch": 5.029520295202952, "grad_norm": 0.7647598207944065, "learning_rate": 5.6349794353652934e-06, "loss": 0.3114, "num_tokens": 75476769.0, "step": 684 }, { "epoch": 5.03690036900369, "grad_norm": 0.7928624379185287, "learning_rate": 5.624183941846243e-06, "loss": 0.3665, "num_tokens": 75608269.0, "step": 685 }, { "epoch": 5.044280442804428, "grad_norm": 0.838843344533878, "learning_rate": 5.6133877330313756e-06, "loss": 0.2534, "num_tokens": 75724961.0, "step": 686 }, { "epoch": 5.051660516605166, "grad_norm": 1.0923805020857955, "learning_rate": 5.6025908711065355e-06, "loss": 0.2563, "num_tokens": 75808372.0, "step": 687 }, { "epoch": 5.059040590405904, "grad_norm": 0.8211586325579034, "learning_rate": 5.591793418261326e-06, "loss": 0.2444, "num_tokens": 75901148.0, "step": 688 }, { "epoch": 5.0664206642066425, "grad_norm": 0.9794952627885642, "learning_rate": 5.580995436688752e-06, "loss": 0.2869, "num_tokens": 76023798.0, "step": 689 }, { "epoch": 5.07380073800738, "grad_norm": 0.7968572551605058, "learning_rate": 5.570196988584867e-06, "loss": 0.246, "num_tokens": 76127369.0, "step": 690 }, { "epoch": 5.081180811808118, "grad_norm": 0.8498337469106462, "learning_rate": 5.559398136148416e-06, "loss": 0.3446, "num_tokens": 76212514.0, "step": 691 }, { "epoch": 5.088560885608856, "grad_norm": 0.8863003457661772, "learning_rate": 5.548598941580464e-06, "loss": 0.3075, "num_tokens": 76315534.0, "step": 692 }, { "epoch": 5.095940959409594, "grad_norm": 0.6821618701141282, "learning_rate": 5.537799467084051e-06, "loss": 0.7477, "num_tokens": 76432951.0, "step": 693 }, { "epoch": 5.1033210332103325, "grad_norm": 0.7758856845828853, "learning_rate": 5.526999774863831e-06, "loss": 0.3442, "num_tokens": 76575911.0, "step": 694 }, { "epoch": 5.11070110701107, "grad_norm": 0.864538306122709, "learning_rate": 5.516199927125711e-06, "loss": 0.326, "num_tokens": 76685085.0, "step": 695 }, { "epoch": 5.118081180811808, "grad_norm": 0.6841879390249027, "learning_rate": 5.505399986076491e-06, "loss": 0.3219, "num_tokens": 76821158.0, "step": 696 }, { "epoch": 5.125461254612546, "grad_norm": 0.763740855553643, "learning_rate": 5.49460001392351e-06, "loss": 0.341, "num_tokens": 76930269.0, "step": 697 }, { "epoch": 5.132841328413284, "grad_norm": 0.7570755487804708, "learning_rate": 5.48380007287429e-06, "loss": 0.2881, "num_tokens": 77056550.0, "step": 698 }, { "epoch": 5.1402214022140225, "grad_norm": 0.8240156667031876, "learning_rate": 5.47300022513617e-06, "loss": 0.3072, "num_tokens": 77145852.0, "step": 699 }, { "epoch": 5.14760147601476, "grad_norm": 0.784008083171791, "learning_rate": 5.462200532915951e-06, "loss": 0.3501, "num_tokens": 77308212.0, "step": 700 }, { "epoch": 5.154981549815498, "grad_norm": 0.7108287634133911, "learning_rate": 5.451401058419537e-06, "loss": 0.2905, "num_tokens": 77423799.0, "step": 701 }, { "epoch": 5.162361623616236, "grad_norm": 0.7883814815017057, "learning_rate": 5.4406018638515855e-06, "loss": 0.3394, "num_tokens": 77546225.0, "step": 702 }, { "epoch": 5.169741697416974, "grad_norm": 0.8230808734474867, "learning_rate": 5.4298030114151335e-06, "loss": 0.2824, "num_tokens": 77660108.0, "step": 703 }, { "epoch": 5.177121771217712, "grad_norm": 0.7732749264696992, "learning_rate": 5.4190045633112506e-06, "loss": 0.2962, "num_tokens": 77779396.0, "step": 704 }, { "epoch": 5.18450184501845, "grad_norm": 0.7020821068054012, "learning_rate": 5.408206581738677e-06, "loss": 0.33, "num_tokens": 77891837.0, "step": 705 }, { "epoch": 5.191881918819188, "grad_norm": 0.7757566830553919, "learning_rate": 5.397409128893465e-06, "loss": 0.2819, "num_tokens": 77994807.0, "step": 706 }, { "epoch": 5.199261992619927, "grad_norm": 0.7471023211265383, "learning_rate": 5.386612266968625e-06, "loss": 0.2987, "num_tokens": 78101325.0, "step": 707 }, { "epoch": 5.206642066420664, "grad_norm": 0.7249487534138149, "learning_rate": 5.375816058153759e-06, "loss": 0.3243, "num_tokens": 78222337.0, "step": 708 }, { "epoch": 5.214022140221402, "grad_norm": 0.6787659618795301, "learning_rate": 5.365020564634709e-06, "loss": 0.3288, "num_tokens": 78353270.0, "step": 709 }, { "epoch": 5.22140221402214, "grad_norm": 0.8629790416290948, "learning_rate": 5.354225848593197e-06, "loss": 0.2714, "num_tokens": 78494800.0, "step": 710 }, { "epoch": 5.228782287822878, "grad_norm": 0.6892763943645038, "learning_rate": 5.343431972206467e-06, "loss": 0.3069, "num_tokens": 78649799.0, "step": 711 }, { "epoch": 5.236162361623617, "grad_norm": 0.6629490118039914, "learning_rate": 5.332638997646928e-06, "loss": 0.3256, "num_tokens": 78753962.0, "step": 712 }, { "epoch": 5.243542435424354, "grad_norm": 0.8394640967851109, "learning_rate": 5.321846987081789e-06, "loss": 0.3206, "num_tokens": 78867337.0, "step": 713 }, { "epoch": 5.250922509225092, "grad_norm": 0.8567789895287279, "learning_rate": 5.311056002672712e-06, "loss": 0.289, "num_tokens": 78941648.0, "step": 714 }, { "epoch": 5.25830258302583, "grad_norm": 0.828870401236965, "learning_rate": 5.300266106575449e-06, "loss": 0.258, "num_tokens": 79044888.0, "step": 715 }, { "epoch": 5.265682656826568, "grad_norm": 0.7217389292988418, "learning_rate": 5.28947736093948e-06, "loss": 0.2869, "num_tokens": 79161450.0, "step": 716 }, { "epoch": 5.273062730627307, "grad_norm": 0.8522635118245224, "learning_rate": 5.278689827907658e-06, "loss": 0.2746, "num_tokens": 79245810.0, "step": 717 }, { "epoch": 5.280442804428044, "grad_norm": 0.7727771460862353, "learning_rate": 5.2679035696158545e-06, "loss": 0.2674, "num_tokens": 79363746.0, "step": 718 }, { "epoch": 5.287822878228782, "grad_norm": 0.8199060030313494, "learning_rate": 5.257118648192595e-06, "loss": 0.2579, "num_tokens": 79473017.0, "step": 719 }, { "epoch": 5.29520295202952, "grad_norm": 0.6251600014266541, "learning_rate": 5.246335125758708e-06, "loss": 0.2522, "num_tokens": 79579974.0, "step": 720 }, { "epoch": 5.302583025830258, "grad_norm": 0.7856376981624948, "learning_rate": 5.235553064426962e-06, "loss": 0.3232, "num_tokens": 79703574.0, "step": 721 }, { "epoch": 5.3099630996309966, "grad_norm": 0.9035725837038168, "learning_rate": 5.224772526301709e-06, "loss": 0.2593, "num_tokens": 79783101.0, "step": 722 }, { "epoch": 5.317343173431734, "grad_norm": 0.7035504175759855, "learning_rate": 5.2139935734785286e-06, "loss": 0.2619, "num_tokens": 79919150.0, "step": 723 }, { "epoch": 5.324723247232472, "grad_norm": 0.6941951970579242, "learning_rate": 5.203216268043871e-06, "loss": 0.2558, "num_tokens": 80013775.0, "step": 724 }, { "epoch": 5.332103321033211, "grad_norm": 0.7962220136583572, "learning_rate": 5.1924406720746945e-06, "loss": 0.2883, "num_tokens": 80096300.0, "step": 725 }, { "epoch": 5.339483394833948, "grad_norm": 0.7074533354205865, "learning_rate": 5.18166684763811e-06, "loss": 0.2714, "num_tokens": 80222399.0, "step": 726 }, { "epoch": 5.3468634686346865, "grad_norm": 0.8425504298670053, "learning_rate": 5.170894856791029e-06, "loss": 0.2963, "num_tokens": 80303214.0, "step": 727 }, { "epoch": 5.354243542435424, "grad_norm": 0.8349389571836422, "learning_rate": 5.160124761579795e-06, "loss": 0.2792, "num_tokens": 80403735.0, "step": 728 }, { "epoch": 5.361623616236162, "grad_norm": 0.8123470866862423, "learning_rate": 5.149356624039841e-06, "loss": 0.3166, "num_tokens": 80502906.0, "step": 729 }, { "epoch": 5.369003690036901, "grad_norm": 0.767130869184814, "learning_rate": 5.138590506195317e-06, "loss": 0.2721, "num_tokens": 80599662.0, "step": 730 }, { "epoch": 5.376383763837638, "grad_norm": 0.8751480885508618, "learning_rate": 5.1278264700587425e-06, "loss": 0.2788, "num_tokens": 80696350.0, "step": 731 }, { "epoch": 5.3837638376383765, "grad_norm": 0.6445966908177907, "learning_rate": 5.1170645776306425e-06, "loss": 0.3166, "num_tokens": 80812184.0, "step": 732 }, { "epoch": 5.391143911439114, "grad_norm": 0.7204141796556899, "learning_rate": 5.106304890899203e-06, "loss": 0.2975, "num_tokens": 80936451.0, "step": 733 }, { "epoch": 5.398523985239852, "grad_norm": 0.8073567323623798, "learning_rate": 5.095547471839899e-06, "loss": 0.2563, "num_tokens": 81034128.0, "step": 734 }, { "epoch": 5.405904059040591, "grad_norm": 0.6714427753918635, "learning_rate": 5.084792382415142e-06, "loss": 0.2291, "num_tokens": 81138682.0, "step": 735 }, { "epoch": 5.413284132841328, "grad_norm": 0.6240699238325595, "learning_rate": 5.0740396845739305e-06, "loss": 0.2665, "num_tokens": 81260963.0, "step": 736 }, { "epoch": 5.4206642066420665, "grad_norm": 0.7867412356008463, "learning_rate": 5.063289440251481e-06, "loss": 0.2566, "num_tokens": 81375587.0, "step": 737 }, { "epoch": 5.428044280442805, "grad_norm": 0.7584654734941334, "learning_rate": 5.052541711368886e-06, "loss": 0.2643, "num_tokens": 81484438.0, "step": 738 }, { "epoch": 5.435424354243542, "grad_norm": 0.7205599986317057, "learning_rate": 5.041796559832742e-06, "loss": 0.2732, "num_tokens": 81596696.0, "step": 739 }, { "epoch": 5.442804428044281, "grad_norm": 0.7574253228099226, "learning_rate": 5.031054047534801e-06, "loss": 0.3404, "num_tokens": 81692711.0, "step": 740 }, { "epoch": 5.450184501845018, "grad_norm": 0.7947677794363449, "learning_rate": 5.02031423635162e-06, "loss": 0.3239, "num_tokens": 81792454.0, "step": 741 }, { "epoch": 5.4575645756457565, "grad_norm": 0.7271884820294199, "learning_rate": 5.009577188144188e-06, "loss": 0.291, "num_tokens": 81915318.0, "step": 742 }, { "epoch": 5.464944649446495, "grad_norm": 0.760473112873417, "learning_rate": 4.998842964757585e-06, "loss": 0.3263, "num_tokens": 82033948.0, "step": 743 }, { "epoch": 5.472324723247232, "grad_norm": 0.7893622977330518, "learning_rate": 4.98811162802062e-06, "loss": 0.3152, "num_tokens": 82179793.0, "step": 744 }, { "epoch": 5.479704797047971, "grad_norm": 0.7019465607170431, "learning_rate": 4.977383239745473e-06, "loss": 0.3124, "num_tokens": 82289619.0, "step": 745 }, { "epoch": 5.487084870848708, "grad_norm": 0.8250003474119553, "learning_rate": 4.9666578617273385e-06, "loss": 0.3216, "num_tokens": 82406037.0, "step": 746 }, { "epoch": 5.4944649446494465, "grad_norm": 0.6795317303820285, "learning_rate": 4.95593555574408e-06, "loss": 0.2955, "num_tokens": 82520422.0, "step": 747 }, { "epoch": 5.501845018450185, "grad_norm": 0.8364977112949799, "learning_rate": 4.945216383555861e-06, "loss": 0.3216, "num_tokens": 82610471.0, "step": 748 }, { "epoch": 5.509225092250922, "grad_norm": 0.6878825020678628, "learning_rate": 4.934500406904791e-06, "loss": 0.2703, "num_tokens": 82728500.0, "step": 749 }, { "epoch": 5.516605166051661, "grad_norm": 0.7666698643359626, "learning_rate": 4.923787687514583e-06, "loss": 0.2858, "num_tokens": 82836219.0, "step": 750 }, { "epoch": 5.523985239852399, "grad_norm": 0.8127921752310425, "learning_rate": 4.913078287090179e-06, "loss": 0.2871, "num_tokens": 82940103.0, "step": 751 }, { "epoch": 5.531365313653136, "grad_norm": 0.8054768120172173, "learning_rate": 4.902372267317405e-06, "loss": 0.3016, "num_tokens": 83025222.0, "step": 752 }, { "epoch": 5.538745387453875, "grad_norm": 0.836243912493921, "learning_rate": 4.891669689862622e-06, "loss": 0.26, "num_tokens": 83156808.0, "step": 753 }, { "epoch": 5.546125461254612, "grad_norm": 0.7176219626456037, "learning_rate": 4.880970616372357e-06, "loss": 0.2925, "num_tokens": 83240173.0, "step": 754 }, { "epoch": 5.553505535055351, "grad_norm": 0.7513808908409143, "learning_rate": 4.8702751084729515e-06, "loss": 0.3037, "num_tokens": 83398963.0, "step": 755 }, { "epoch": 5.560885608856088, "grad_norm": 0.7860052138199383, "learning_rate": 4.859583227770218e-06, "loss": 0.2865, "num_tokens": 83501594.0, "step": 756 }, { "epoch": 5.568265682656826, "grad_norm": 0.7064769415440794, "learning_rate": 4.848895035849069e-06, "loss": 0.7912, "num_tokens": 83620892.0, "step": 757 }, { "epoch": 5.575645756457565, "grad_norm": 0.7332008728932892, "learning_rate": 4.838210594273173e-06, "loss": 0.2481, "num_tokens": 83724229.0, "step": 758 }, { "epoch": 5.583025830258302, "grad_norm": 0.7789765211913474, "learning_rate": 4.827529964584597e-06, "loss": 0.2908, "num_tokens": 83815219.0, "step": 759 }, { "epoch": 5.590405904059041, "grad_norm": 0.8722227692511327, "learning_rate": 4.816853208303451e-06, "loss": 0.3432, "num_tokens": 83955057.0, "step": 760 }, { "epoch": 5.597785977859779, "grad_norm": 0.6936633739164829, "learning_rate": 4.8061803869275346e-06, "loss": 0.3135, "num_tokens": 84063496.0, "step": 761 }, { "epoch": 5.605166051660516, "grad_norm": 0.7446073915999715, "learning_rate": 4.795511561931979e-06, "loss": 0.3309, "num_tokens": 84194864.0, "step": 762 }, { "epoch": 5.612546125461255, "grad_norm": 0.7682353671073113, "learning_rate": 4.784846794768901e-06, "loss": 0.3139, "num_tokens": 84347886.0, "step": 763 }, { "epoch": 5.619926199261993, "grad_norm": 0.8300487958678651, "learning_rate": 4.7741861468670436e-06, "loss": 0.3285, "num_tokens": 84486118.0, "step": 764 }, { "epoch": 5.627306273062731, "grad_norm": 0.7269550459131697, "learning_rate": 4.76352967963142e-06, "loss": 0.3205, "num_tokens": 84602385.0, "step": 765 }, { "epoch": 5.634686346863469, "grad_norm": 0.8167497040920041, "learning_rate": 4.752877454442965e-06, "loss": 0.2868, "num_tokens": 84735813.0, "step": 766 }, { "epoch": 5.642066420664206, "grad_norm": 0.8206917156476586, "learning_rate": 4.742229532658181e-06, "loss": 0.2631, "num_tokens": 84820136.0, "step": 767 }, { "epoch": 5.649446494464945, "grad_norm": 0.7621364635308342, "learning_rate": 4.731585975608781e-06, "loss": 0.311, "num_tokens": 84962252.0, "step": 768 }, { "epoch": 5.656826568265682, "grad_norm": 0.838977616803826, "learning_rate": 4.7209468446013376e-06, "loss": 0.2549, "num_tokens": 85036308.0, "step": 769 }, { "epoch": 5.6642066420664205, "grad_norm": 0.6747749021099678, "learning_rate": 4.710312200916929e-06, "loss": 0.3036, "num_tokens": 85154854.0, "step": 770 }, { "epoch": 5.671586715867159, "grad_norm": 0.7989285937342706, "learning_rate": 4.699682105810786e-06, "loss": 0.2443, "num_tokens": 85280088.0, "step": 771 }, { "epoch": 5.678966789667896, "grad_norm": 0.7371279613804671, "learning_rate": 4.68905662051194e-06, "loss": 0.2769, "num_tokens": 85357886.0, "step": 772 }, { "epoch": 5.686346863468635, "grad_norm": 0.7841880701337192, "learning_rate": 4.678435806222873e-06, "loss": 0.2655, "num_tokens": 85472569.0, "step": 773 }, { "epoch": 5.693726937269373, "grad_norm": 0.670554410834585, "learning_rate": 4.667819724119159e-06, "loss": 0.26, "num_tokens": 85591197.0, "step": 774 }, { "epoch": 5.7011070110701105, "grad_norm": 0.7609022381843286, "learning_rate": 4.657208435349114e-06, "loss": 0.2163, "num_tokens": 85685049.0, "step": 775 }, { "epoch": 5.708487084870849, "grad_norm": 0.7361212837503709, "learning_rate": 4.64660200103345e-06, "loss": 0.2669, "num_tokens": 85791889.0, "step": 776 }, { "epoch": 5.715867158671586, "grad_norm": 0.8126259109165639, "learning_rate": 4.63600048226491e-06, "loss": 0.3047, "num_tokens": 85873668.0, "step": 777 }, { "epoch": 5.723247232472325, "grad_norm": 0.7536847458523733, "learning_rate": 4.625403940107929e-06, "loss": 0.6545, "num_tokens": 85977141.0, "step": 778 }, { "epoch": 5.730627306273063, "grad_norm": 0.8390122703405435, "learning_rate": 4.614812435598275e-06, "loss": 0.2571, "num_tokens": 86100651.0, "step": 779 }, { "epoch": 5.7380073800738005, "grad_norm": 0.7184304402233668, "learning_rate": 4.604226029742697e-06, "loss": 0.244, "num_tokens": 86208137.0, "step": 780 }, { "epoch": 5.745387453874539, "grad_norm": 0.7598391769774648, "learning_rate": 4.593644783518581e-06, "loss": 0.2557, "num_tokens": 86318101.0, "step": 781 }, { "epoch": 5.752767527675276, "grad_norm": 0.7065705531981052, "learning_rate": 4.58306875787359e-06, "loss": 0.2954, "num_tokens": 86469562.0, "step": 782 }, { "epoch": 5.760147601476015, "grad_norm": 0.6639915141423608, "learning_rate": 4.572498013725319e-06, "loss": 0.2974, "num_tokens": 86572873.0, "step": 783 }, { "epoch": 5.767527675276753, "grad_norm": 0.8612338852207884, "learning_rate": 4.561932611960935e-06, "loss": 0.3595, "num_tokens": 86705910.0, "step": 784 }, { "epoch": 5.7749077490774905, "grad_norm": 0.7365204739437818, "learning_rate": 4.551372613436845e-06, "loss": 0.2591, "num_tokens": 86806358.0, "step": 785 }, { "epoch": 5.782287822878229, "grad_norm": 0.7811812979489162, "learning_rate": 4.540818078978324e-06, "loss": 0.3246, "num_tokens": 86898194.0, "step": 786 }, { "epoch": 5.789667896678967, "grad_norm": 0.8300575223005775, "learning_rate": 4.5302690693791785e-06, "loss": 0.2407, "num_tokens": 87006345.0, "step": 787 }, { "epoch": 5.797047970479705, "grad_norm": 0.8086907029628351, "learning_rate": 4.519725645401387e-06, "loss": 0.2785, "num_tokens": 87117120.0, "step": 788 }, { "epoch": 5.804428044280443, "grad_norm": 0.6746765611467019, "learning_rate": 4.509187867774762e-06, "loss": 0.5363, "num_tokens": 87251836.0, "step": 789 }, { "epoch": 5.8118081180811805, "grad_norm": 0.8287931036381043, "learning_rate": 4.4986557971965865e-06, "loss": 0.2669, "num_tokens": 87357495.0, "step": 790 }, { "epoch": 5.819188191881919, "grad_norm": 0.6706419685085858, "learning_rate": 4.488129494331276e-06, "loss": 0.2355, "num_tokens": 87446459.0, "step": 791 }, { "epoch": 5.826568265682657, "grad_norm": 0.786872973851237, "learning_rate": 4.477609019810022e-06, "loss": 0.2984, "num_tokens": 87549420.0, "step": 792 }, { "epoch": 5.833948339483395, "grad_norm": 0.7754342944870574, "learning_rate": 4.467094434230445e-06, "loss": 0.2939, "num_tokens": 87646947.0, "step": 793 }, { "epoch": 5.841328413284133, "grad_norm": 0.8200545823833467, "learning_rate": 4.456585798156246e-06, "loss": 0.2938, "num_tokens": 87764542.0, "step": 794 }, { "epoch": 5.8487084870848705, "grad_norm": 0.7562355763532292, "learning_rate": 4.446083172116858e-06, "loss": 0.302, "num_tokens": 87872220.0, "step": 795 }, { "epoch": 5.856088560885609, "grad_norm": 0.9046034705903128, "learning_rate": 4.435586616607094e-06, "loss": 0.3068, "num_tokens": 87971032.0, "step": 796 }, { "epoch": 5.863468634686347, "grad_norm": 0.7704842545134923, "learning_rate": 4.4250961920868005e-06, "loss": 0.2637, "num_tokens": 88064195.0, "step": 797 }, { "epoch": 5.870848708487085, "grad_norm": 0.7742359193055628, "learning_rate": 4.414611958980512e-06, "loss": 0.2681, "num_tokens": 88172050.0, "step": 798 }, { "epoch": 5.878228782287823, "grad_norm": 0.7056198246854335, "learning_rate": 4.404133977677101e-06, "loss": 0.2496, "num_tokens": 88300724.0, "step": 799 }, { "epoch": 5.885608856088561, "grad_norm": 0.7125491273298457, "learning_rate": 4.393662308529427e-06, "loss": 0.2427, "num_tokens": 88431435.0, "step": 800 }, { "epoch": 5.892988929889299, "grad_norm": 0.6511055726535395, "learning_rate": 4.383197011853993e-06, "loss": 0.3121, "num_tokens": 88549593.0, "step": 801 }, { "epoch": 5.900369003690037, "grad_norm": 0.9074280458375835, "learning_rate": 4.372738147930599e-06, "loss": 0.2847, "num_tokens": 88643972.0, "step": 802 }, { "epoch": 5.907749077490775, "grad_norm": 0.7638665779652982, "learning_rate": 4.362285777001989e-06, "loss": 0.3132, "num_tokens": 88769522.0, "step": 803 }, { "epoch": 5.915129151291513, "grad_norm": 0.8305226024562709, "learning_rate": 4.35183995927351e-06, "loss": 0.2635, "num_tokens": 88859548.0, "step": 804 }, { "epoch": 5.922509225092251, "grad_norm": 0.9231553389167491, "learning_rate": 4.34140075491276e-06, "loss": 0.3237, "num_tokens": 88951297.0, "step": 805 }, { "epoch": 5.929889298892989, "grad_norm": 0.6204930789359487, "learning_rate": 4.330968224049248e-06, "loss": 0.3277, "num_tokens": 89077101.0, "step": 806 }, { "epoch": 5.937269372693727, "grad_norm": 0.7903970100872257, "learning_rate": 4.320542426774042e-06, "loss": 0.2492, "num_tokens": 89162461.0, "step": 807 }, { "epoch": 5.944649446494465, "grad_norm": 0.7374651281826408, "learning_rate": 4.310123423139422e-06, "loss": 0.3697, "num_tokens": 89267370.0, "step": 808 }, { "epoch": 5.952029520295203, "grad_norm": 0.6716770325771486, "learning_rate": 4.299711273158542e-06, "loss": 0.2777, "num_tokens": 89408407.0, "step": 809 }, { "epoch": 5.959409594095941, "grad_norm": 0.8587406649423595, "learning_rate": 4.289306036805077e-06, "loss": 0.2774, "num_tokens": 89497445.0, "step": 810 }, { "epoch": 5.966789667896679, "grad_norm": 0.8292847782488489, "learning_rate": 4.278907774012876e-06, "loss": 0.2557, "num_tokens": 89617678.0, "step": 811 }, { "epoch": 5.974169741697417, "grad_norm": 0.7438193152871864, "learning_rate": 4.268516544675628e-06, "loss": 0.2441, "num_tokens": 89714542.0, "step": 812 }, { "epoch": 5.9815498154981555, "grad_norm": 0.6870379095079707, "learning_rate": 4.258132408646503e-06, "loss": 0.2228, "num_tokens": 89823849.0, "step": 813 }, { "epoch": 5.988929889298893, "grad_norm": 0.7013293356784026, "learning_rate": 4.247755425737816e-06, "loss": 0.2735, "num_tokens": 89919377.0, "step": 814 }, { "epoch": 5.996309963099631, "grad_norm": 0.773828181140405, "learning_rate": 4.237385655720681e-06, "loss": 0.3245, "num_tokens": 90038971.0, "step": 815 }, { "epoch": 6.0, "grad_norm": 1.2173223861261109, "learning_rate": 4.227023158324666e-06, "loss": 0.2545, "num_tokens": 90106097.0, "step": 816 }, { "epoch": 6.007380073800738, "grad_norm": 0.754619177850811, "learning_rate": 4.216667993237445e-06, "loss": 0.2196, "num_tokens": 90213239.0, "step": 817 }, { "epoch": 6.014760147601476, "grad_norm": 0.7501441529725348, "learning_rate": 4.206320220104464e-06, "loss": 0.2059, "num_tokens": 90306923.0, "step": 818 }, { "epoch": 6.022140221402214, "grad_norm": 0.823773233978655, "learning_rate": 4.195979898528589e-06, "loss": 0.2554, "num_tokens": 90439883.0, "step": 819 }, { "epoch": 6.029520295202952, "grad_norm": 0.6126983908864717, "learning_rate": 4.185647088069765e-06, "loss": 0.4358, "num_tokens": 90610253.0, "step": 820 }, { "epoch": 6.03690036900369, "grad_norm": 0.8108549742677466, "learning_rate": 4.175321848244673e-06, "loss": 0.1915, "num_tokens": 90701249.0, "step": 821 }, { "epoch": 6.044280442804428, "grad_norm": 0.8821882563133362, "learning_rate": 4.165004238526388e-06, "loss": 0.2422, "num_tokens": 90816090.0, "step": 822 }, { "epoch": 6.051660516605166, "grad_norm": 1.1293556042513082, "learning_rate": 4.1546943183440344e-06, "loss": 0.2124, "num_tokens": 90908059.0, "step": 823 }, { "epoch": 6.059040590405904, "grad_norm": 1.1286542477044417, "learning_rate": 4.144392147082448e-06, "loss": 0.198, "num_tokens": 90988560.0, "step": 824 }, { "epoch": 6.0664206642066425, "grad_norm": 0.9270382141381814, "learning_rate": 4.134097784081826e-06, "loss": 0.1983, "num_tokens": 91093452.0, "step": 825 }, { "epoch": 6.07380073800738, "grad_norm": 0.8295770283434506, "learning_rate": 4.123811288637397e-06, "loss": 0.1738, "num_tokens": 91212659.0, "step": 826 }, { "epoch": 6.081180811808118, "grad_norm": 0.7985117078350946, "learning_rate": 4.113532719999067e-06, "loss": 0.1932, "num_tokens": 91327661.0, "step": 827 }, { "epoch": 6.088560885608856, "grad_norm": 0.7313065345505175, "learning_rate": 4.103262137371087e-06, "loss": 0.2618, "num_tokens": 91460086.0, "step": 828 }, { "epoch": 6.095940959409594, "grad_norm": 0.7559559275244944, "learning_rate": 4.0929995999117085e-06, "loss": 0.2455, "num_tokens": 91569256.0, "step": 829 }, { "epoch": 6.1033210332103325, "grad_norm": 0.8028774258101546, "learning_rate": 4.082745166732842e-06, "loss": 0.2247, "num_tokens": 91675596.0, "step": 830 }, { "epoch": 6.11070110701107, "grad_norm": 0.7499423102410786, "learning_rate": 4.072498896899718e-06, "loss": 0.2294, "num_tokens": 91781549.0, "step": 831 }, { "epoch": 6.118081180811808, "grad_norm": 0.6529902129393093, "learning_rate": 4.062260849430546e-06, "loss": 0.2013, "num_tokens": 91927478.0, "step": 832 }, { "epoch": 6.125461254612546, "grad_norm": 0.6601754149059782, "learning_rate": 4.052031083296175e-06, "loss": 0.1806, "num_tokens": 92041926.0, "step": 833 }, { "epoch": 6.132841328413284, "grad_norm": 0.7415551312116556, "learning_rate": 4.041809657419755e-06, "loss": 0.1921, "num_tokens": 92167319.0, "step": 834 }, { "epoch": 6.1402214022140225, "grad_norm": 0.7601195859487445, "learning_rate": 4.031596630676394e-06, "loss": 0.2371, "num_tokens": 92286885.0, "step": 835 }, { "epoch": 6.14760147601476, "grad_norm": 0.7380371385249106, "learning_rate": 4.021392061892824e-06, "loss": 0.2135, "num_tokens": 92389683.0, "step": 836 }, { "epoch": 6.154981549815498, "grad_norm": 0.7925402829629689, "learning_rate": 4.011196009847057e-06, "loss": 0.1935, "num_tokens": 92496539.0, "step": 837 }, { "epoch": 6.162361623616236, "grad_norm": 0.8074333970201613, "learning_rate": 4.001008533268052e-06, "loss": 0.2198, "num_tokens": 92601836.0, "step": 838 }, { "epoch": 6.169741697416974, "grad_norm": 0.8638114016631451, "learning_rate": 3.990829690835369e-06, "loss": 0.1886, "num_tokens": 92693125.0, "step": 839 }, { "epoch": 6.177121771217712, "grad_norm": 0.9153865801186486, "learning_rate": 3.980659541178842e-06, "loss": 0.2108, "num_tokens": 92778023.0, "step": 840 }, { "epoch": 6.18450184501845, "grad_norm": 0.711842304748427, "learning_rate": 3.970498142878226e-06, "loss": 0.1809, "num_tokens": 92897683.0, "step": 841 }, { "epoch": 6.191881918819188, "grad_norm": 0.6796198153078575, "learning_rate": 3.9603455544628754e-06, "loss": 0.1942, "num_tokens": 93011980.0, "step": 842 }, { "epoch": 6.199261992619927, "grad_norm": 0.7385851732118687, "learning_rate": 3.9502018344113975e-06, "loss": 0.2006, "num_tokens": 93136769.0, "step": 843 }, { "epoch": 6.206642066420664, "grad_norm": 0.7724145049291772, "learning_rate": 3.94006704115132e-06, "loss": 0.2243, "num_tokens": 93233152.0, "step": 844 }, { "epoch": 6.214022140221402, "grad_norm": 0.8193526198793998, "learning_rate": 3.9299412330587504e-06, "loss": 0.2172, "num_tokens": 93337100.0, "step": 845 }, { "epoch": 6.22140221402214, "grad_norm": 0.8250040421131153, "learning_rate": 3.919824468458041e-06, "loss": 0.2384, "num_tokens": 93501961.0, "step": 846 }, { "epoch": 6.228782287822878, "grad_norm": 0.743900475287342, "learning_rate": 3.909716805621459e-06, "loss": 0.2542, "num_tokens": 93634298.0, "step": 847 }, { "epoch": 6.236162361623617, "grad_norm": 0.7646462253424315, "learning_rate": 3.899618302768837e-06, "loss": 0.2076, "num_tokens": 93746723.0, "step": 848 }, { "epoch": 6.243542435424354, "grad_norm": 0.7762869644495579, "learning_rate": 3.889529018067256e-06, "loss": 0.7056, "num_tokens": 93884297.0, "step": 849 }, { "epoch": 6.250922509225092, "grad_norm": 0.80931274189963, "learning_rate": 3.879449009630694e-06, "loss": 0.2629, "num_tokens": 93990776.0, "step": 850 }, { "epoch": 6.25830258302583, "grad_norm": 0.6833787978478929, "learning_rate": 3.869378335519701e-06, "loss": 0.2117, "num_tokens": 94117404.0, "step": 851 }, { "epoch": 6.265682656826568, "grad_norm": 0.7413250935105922, "learning_rate": 3.8593170537410585e-06, "loss": 0.1748, "num_tokens": 94213518.0, "step": 852 }, { "epoch": 6.273062730627307, "grad_norm": 0.7159824628287972, "learning_rate": 3.849265222247452e-06, "loss": 0.1929, "num_tokens": 94334198.0, "step": 853 }, { "epoch": 6.280442804428044, "grad_norm": 0.8765176099031716, "learning_rate": 3.839222898937136e-06, "loss": 0.212, "num_tokens": 94421654.0, "step": 854 }, { "epoch": 6.287822878228782, "grad_norm": 0.6509756542972459, "learning_rate": 3.8291901416535895e-06, "loss": 0.1778, "num_tokens": 94534406.0, "step": 855 }, { "epoch": 6.29520295202952, "grad_norm": 0.8400790274459855, "learning_rate": 3.819167008185198e-06, "loss": 0.2379, "num_tokens": 94652173.0, "step": 856 }, { "epoch": 6.302583025830258, "grad_norm": 0.9179393571701614, "learning_rate": 3.809153556264914e-06, "loss": 0.2516, "num_tokens": 94749142.0, "step": 857 }, { "epoch": 6.3099630996309966, "grad_norm": 0.8240154718085584, "learning_rate": 3.7991498435699213e-06, "loss": 0.4507, "num_tokens": 94853046.0, "step": 858 }, { "epoch": 6.317343173431734, "grad_norm": 0.7493697083757599, "learning_rate": 3.7891559277213095e-06, "loss": 0.1833, "num_tokens": 94945423.0, "step": 859 }, { "epoch": 6.324723247232472, "grad_norm": 0.990775891335437, "learning_rate": 3.779171866283734e-06, "loss": 0.2106, "num_tokens": 95038518.0, "step": 860 }, { "epoch": 6.332103321033211, "grad_norm": 0.8525222926707304, "learning_rate": 3.7691977167650952e-06, "loss": 0.2581, "num_tokens": 95160151.0, "step": 861 }, { "epoch": 6.339483394833948, "grad_norm": 0.6823449059636649, "learning_rate": 3.759233536616197e-06, "loss": 0.2158, "num_tokens": 95275795.0, "step": 862 }, { "epoch": 6.3468634686346865, "grad_norm": 0.7102219478305801, "learning_rate": 3.749279383230421e-06, "loss": 0.2193, "num_tokens": 95392967.0, "step": 863 }, { "epoch": 6.354243542435424, "grad_norm": 0.7387604183406936, "learning_rate": 3.7393353139433952e-06, "loss": 0.2265, "num_tokens": 95514701.0, "step": 864 }, { "epoch": 6.361623616236162, "grad_norm": 0.7600271999176372, "learning_rate": 3.729401386032663e-06, "loss": 0.2192, "num_tokens": 95617043.0, "step": 865 }, { "epoch": 6.369003690036901, "grad_norm": 0.8308997001060889, "learning_rate": 3.719477656717355e-06, "loss": 0.1975, "num_tokens": 95703514.0, "step": 866 }, { "epoch": 6.376383763837638, "grad_norm": 0.7156260219335532, "learning_rate": 3.7095641831578567e-06, "loss": 0.2035, "num_tokens": 95812288.0, "step": 867 }, { "epoch": 6.3837638376383765, "grad_norm": 0.8875140819742275, "learning_rate": 3.699661022455482e-06, "loss": 0.1898, "num_tokens": 95901945.0, "step": 868 }, { "epoch": 6.391143911439114, "grad_norm": 0.6696637884259364, "learning_rate": 3.689768231652141e-06, "loss": 0.2058, "num_tokens": 96049902.0, "step": 869 }, { "epoch": 6.398523985239852, "grad_norm": 0.7905426177757932, "learning_rate": 3.6798858677300143e-06, "loss": 0.1587, "num_tokens": 96153520.0, "step": 870 }, { "epoch": 6.405904059040591, "grad_norm": 0.8013809496278096, "learning_rate": 3.670013987611226e-06, "loss": 0.1917, "num_tokens": 96252026.0, "step": 871 }, { "epoch": 6.413284132841328, "grad_norm": 0.8162319784741491, "learning_rate": 3.6601526481575133e-06, "loss": 0.2048, "num_tokens": 96348710.0, "step": 872 }, { "epoch": 6.4206642066420665, "grad_norm": 0.7328529544115915, "learning_rate": 3.650301906169896e-06, "loss": 0.1907, "num_tokens": 96440569.0, "step": 873 }, { "epoch": 6.428044280442805, "grad_norm": 0.8296301704513374, "learning_rate": 3.640461818388359e-06, "loss": 0.1968, "num_tokens": 96532079.0, "step": 874 }, { "epoch": 6.435424354243542, "grad_norm": 0.6444682162510676, "learning_rate": 3.630632441491512e-06, "loss": 0.1823, "num_tokens": 96643844.0, "step": 875 }, { "epoch": 6.442804428044281, "grad_norm": 0.744390482720809, "learning_rate": 3.620813832096275e-06, "loss": 0.2158, "num_tokens": 96758044.0, "step": 876 }, { "epoch": 6.450184501845018, "grad_norm": 0.7123681226464863, "learning_rate": 3.611006046757547e-06, "loss": 0.2033, "num_tokens": 96862371.0, "step": 877 }, { "epoch": 6.4575645756457565, "grad_norm": 0.7526500553085586, "learning_rate": 3.6012091419678808e-06, "loss": 0.1947, "num_tokens": 96981106.0, "step": 878 }, { "epoch": 6.464944649446495, "grad_norm": 0.7614999330943449, "learning_rate": 3.591423174157154e-06, "loss": 0.1645, "num_tokens": 97107831.0, "step": 879 }, { "epoch": 6.472324723247232, "grad_norm": 0.7011776983112978, "learning_rate": 3.581648199692255e-06, "loss": 0.2517, "num_tokens": 97246067.0, "step": 880 }, { "epoch": 6.479704797047971, "grad_norm": 0.6553053730395717, "learning_rate": 3.5718842748767447e-06, "loss": 0.2208, "num_tokens": 97385229.0, "step": 881 }, { "epoch": 6.487084870848708, "grad_norm": 0.8167144132408032, "learning_rate": 3.5621314559505383e-06, "loss": 0.2525, "num_tokens": 97528757.0, "step": 882 }, { "epoch": 6.4944649446494465, "grad_norm": 0.7398027869252124, "learning_rate": 3.552389799089584e-06, "loss": 0.1958, "num_tokens": 97642105.0, "step": 883 }, { "epoch": 6.501845018450185, "grad_norm": 0.8569939875013843, "learning_rate": 3.542659360405537e-06, "loss": 0.2649, "num_tokens": 97742503.0, "step": 884 }, { "epoch": 6.509225092250922, "grad_norm": 0.6510247379643058, "learning_rate": 3.5329401959454348e-06, "loss": 0.152, "num_tokens": 97849892.0, "step": 885 }, { "epoch": 6.516605166051661, "grad_norm": 0.8610923117709987, "learning_rate": 3.5232323616913745e-06, "loss": 0.208, "num_tokens": 97943605.0, "step": 886 }, { "epoch": 6.523985239852399, "grad_norm": 0.786724364976123, "learning_rate": 3.513535913560194e-06, "loss": 0.1973, "num_tokens": 98032889.0, "step": 887 }, { "epoch": 6.531365313653136, "grad_norm": 0.6947759355032901, "learning_rate": 3.5038509074031444e-06, "loss": 0.1906, "num_tokens": 98150384.0, "step": 888 }, { "epoch": 6.538745387453875, "grad_norm": 0.6696060626479275, "learning_rate": 3.4941773990055784e-06, "loss": 0.2248, "num_tokens": 98287503.0, "step": 889 }, { "epoch": 6.546125461254612, "grad_norm": 0.7507830986698222, "learning_rate": 3.4845154440866137e-06, "loss": 0.1848, "num_tokens": 98381090.0, "step": 890 }, { "epoch": 6.553505535055351, "grad_norm": 0.7553275368445391, "learning_rate": 3.4748650982988245e-06, "loss": 0.2097, "num_tokens": 98516065.0, "step": 891 }, { "epoch": 6.560885608856088, "grad_norm": 0.8156256331640636, "learning_rate": 3.4652264172279153e-06, "loss": 0.178, "num_tokens": 98601146.0, "step": 892 }, { "epoch": 6.568265682656826, "grad_norm": 0.7603646969200515, "learning_rate": 3.4555994563924034e-06, "loss": 0.2084, "num_tokens": 98721470.0, "step": 893 }, { "epoch": 6.575645756457565, "grad_norm": 0.7577945918254975, "learning_rate": 3.4459842712432957e-06, "loss": 0.1636, "num_tokens": 98820078.0, "step": 894 }, { "epoch": 6.583025830258302, "grad_norm": 0.8005240275250651, "learning_rate": 3.436380917163775e-06, "loss": 0.2031, "num_tokens": 98943791.0, "step": 895 }, { "epoch": 6.590405904059041, "grad_norm": 0.799801084880718, "learning_rate": 3.4267894494688735e-06, "loss": 0.2296, "num_tokens": 99082296.0, "step": 896 }, { "epoch": 6.597785977859779, "grad_norm": 0.7522478657943151, "learning_rate": 3.417209923405163e-06, "loss": 0.2294, "num_tokens": 99223930.0, "step": 897 }, { "epoch": 6.605166051660516, "grad_norm": 0.7058009301544409, "learning_rate": 3.407642394150429e-06, "loss": 0.1766, "num_tokens": 99344618.0, "step": 898 }, { "epoch": 6.612546125461255, "grad_norm": 0.8022735323233764, "learning_rate": 3.3980869168133533e-06, "loss": 0.1995, "num_tokens": 99463898.0, "step": 899 }, { "epoch": 6.619926199261993, "grad_norm": 0.7663300241940983, "learning_rate": 3.3885435464332028e-06, "loss": 0.1818, "num_tokens": 99570004.0, "step": 900 }, { "epoch": 6.627306273062731, "grad_norm": 0.7523025442584772, "learning_rate": 3.379012337979507e-06, "loss": 0.1806, "num_tokens": 99684132.0, "step": 901 }, { "epoch": 6.634686346863469, "grad_norm": 0.7324752348611164, "learning_rate": 3.3694933463517443e-06, "loss": 0.2474, "num_tokens": 99805955.0, "step": 902 }, { "epoch": 6.642066420664206, "grad_norm": 0.8812132921472331, "learning_rate": 3.3599866263790227e-06, "loss": 0.2032, "num_tokens": 99902512.0, "step": 903 }, { "epoch": 6.649446494464945, "grad_norm": 0.7770802676786434, "learning_rate": 3.3504922328197675e-06, "loss": 0.1704, "num_tokens": 100013792.0, "step": 904 }, { "epoch": 6.656826568265682, "grad_norm": 0.9755321311222953, "learning_rate": 3.3410102203614024e-06, "loss": 0.2631, "num_tokens": 100119726.0, "step": 905 }, { "epoch": 6.6642066420664205, "grad_norm": 0.6557407401122956, "learning_rate": 3.331540643620039e-06, "loss": 0.1933, "num_tokens": 100245125.0, "step": 906 }, { "epoch": 6.671586715867159, "grad_norm": 0.7389180742131263, "learning_rate": 3.322083557140159e-06, "loss": 0.2223, "num_tokens": 100383345.0, "step": 907 }, { "epoch": 6.678966789667896, "grad_norm": 0.7429765561465446, "learning_rate": 3.3126390153942977e-06, "loss": 0.2308, "num_tokens": 100502064.0, "step": 908 }, { "epoch": 6.686346863468635, "grad_norm": 0.8927199535053768, "learning_rate": 3.3032070727827358e-06, "loss": 0.2295, "num_tokens": 100582734.0, "step": 909 }, { "epoch": 6.693726937269373, "grad_norm": 0.876892615200498, "learning_rate": 3.293787783633182e-06, "loss": 0.1841, "num_tokens": 100673332.0, "step": 910 }, { "epoch": 6.7011070110701105, "grad_norm": 0.6604620029571926, "learning_rate": 3.2843812022004606e-06, "loss": 0.1542, "num_tokens": 100765299.0, "step": 911 }, { "epoch": 6.708487084870849, "grad_norm": 0.7407674652150991, "learning_rate": 3.2749873826662047e-06, "loss": 0.2064, "num_tokens": 100882033.0, "step": 912 }, { "epoch": 6.715867158671586, "grad_norm": 0.86708507778245, "learning_rate": 3.265606379138534e-06, "loss": 0.1649, "num_tokens": 100963233.0, "step": 913 }, { "epoch": 6.723247232472325, "grad_norm": 0.8608589329839527, "learning_rate": 3.2562382456517495e-06, "loss": 0.2295, "num_tokens": 101055152.0, "step": 914 }, { "epoch": 6.730627306273063, "grad_norm": 0.9392155344063237, "learning_rate": 3.246883036166023e-06, "loss": 0.1896, "num_tokens": 101137124.0, "step": 915 }, { "epoch": 6.7380073800738005, "grad_norm": 0.9396451567910546, "learning_rate": 3.2375408045670836e-06, "loss": 0.2353, "num_tokens": 101239694.0, "step": 916 }, { "epoch": 6.745387453874539, "grad_norm": 0.7896628007083686, "learning_rate": 3.228211604665907e-06, "loss": 0.1686, "num_tokens": 101342164.0, "step": 917 }, { "epoch": 6.752767527675276, "grad_norm": 0.8310372293464977, "learning_rate": 3.218895490198407e-06, "loss": 0.2091, "num_tokens": 101448880.0, "step": 918 }, { "epoch": 6.760147601476015, "grad_norm": 0.7450885761579481, "learning_rate": 3.2095925148251273e-06, "loss": 0.1777, "num_tokens": 101546758.0, "step": 919 }, { "epoch": 6.767527675276753, "grad_norm": 0.7199744105820569, "learning_rate": 3.2003027321309287e-06, "loss": 0.151, "num_tokens": 101650068.0, "step": 920 }, { "epoch": 6.7749077490774905, "grad_norm": 0.674624666564349, "learning_rate": 3.1910261956246845e-06, "loss": 0.2322, "num_tokens": 101770709.0, "step": 921 }, { "epoch": 6.782287822878229, "grad_norm": 0.878038648852318, "learning_rate": 3.1817629587389675e-06, "loss": 0.2003, "num_tokens": 101857734.0, "step": 922 }, { "epoch": 6.789667896678967, "grad_norm": 0.7892653869940014, "learning_rate": 3.17251307482975e-06, "loss": 0.3376, "num_tokens": 101956249.0, "step": 923 }, { "epoch": 6.797047970479705, "grad_norm": 0.7905826098236308, "learning_rate": 3.1632765971760875e-06, "loss": 0.2093, "num_tokens": 102044561.0, "step": 924 }, { "epoch": 6.804428044280443, "grad_norm": 0.8891968097952797, "learning_rate": 3.1540535789798168e-06, "loss": 0.2193, "num_tokens": 102137661.0, "step": 925 }, { "epoch": 6.8118081180811805, "grad_norm": 0.7500480325072951, "learning_rate": 3.144844073365247e-06, "loss": 0.2044, "num_tokens": 102247189.0, "step": 926 }, { "epoch": 6.819188191881919, "grad_norm": 0.6365731589988368, "learning_rate": 3.135648133378859e-06, "loss": 0.1924, "num_tokens": 102359037.0, "step": 927 }, { "epoch": 6.826568265682657, "grad_norm": 0.7028378561369589, "learning_rate": 3.126465811988994e-06, "loss": 0.2335, "num_tokens": 102506594.0, "step": 928 }, { "epoch": 6.833948339483395, "grad_norm": 0.7532976280521092, "learning_rate": 3.1172971620855477e-06, "loss": 0.2093, "num_tokens": 102647771.0, "step": 929 }, { "epoch": 6.841328413284133, "grad_norm": 0.734868379726991, "learning_rate": 3.108142236479675e-06, "loss": 0.2066, "num_tokens": 102759707.0, "step": 930 }, { "epoch": 6.8487084870848705, "grad_norm": 0.7295400190719857, "learning_rate": 3.099001087903473e-06, "loss": 0.1763, "num_tokens": 102887449.0, "step": 931 }, { "epoch": 6.856088560885609, "grad_norm": 0.6572139423314011, "learning_rate": 3.0898737690096857e-06, "loss": 0.1564, "num_tokens": 103005173.0, "step": 932 }, { "epoch": 6.863468634686347, "grad_norm": 0.8052758067274763, "learning_rate": 3.080760332371402e-06, "loss": 0.2305, "num_tokens": 103137118.0, "step": 933 }, { "epoch": 6.870848708487085, "grad_norm": 0.7881555315391795, "learning_rate": 3.071660830481743e-06, "loss": 0.4904, "num_tokens": 103270118.0, "step": 934 }, { "epoch": 6.878228782287823, "grad_norm": 0.7603387680036207, "learning_rate": 3.062575315753571e-06, "loss": 0.1851, "num_tokens": 103376975.0, "step": 935 }, { "epoch": 6.885608856088561, "grad_norm": 0.7912312318357577, "learning_rate": 3.0535038405191804e-06, "loss": 0.1821, "num_tokens": 103446424.0, "step": 936 }, { "epoch": 6.892988929889299, "grad_norm": 0.904347558736632, "learning_rate": 3.0444464570299992e-06, "loss": 0.2031, "num_tokens": 103533858.0, "step": 937 }, { "epoch": 6.900369003690037, "grad_norm": 0.730765026740344, "learning_rate": 3.0354032174562864e-06, "loss": 0.1827, "num_tokens": 103624447.0, "step": 938 }, { "epoch": 6.907749077490775, "grad_norm": 0.6950005683139954, "learning_rate": 3.0263741738868348e-06, "loss": 0.1797, "num_tokens": 103739077.0, "step": 939 }, { "epoch": 6.915129151291513, "grad_norm": 0.7767121805718566, "learning_rate": 3.0173593783286644e-06, "loss": 0.2039, "num_tokens": 103838997.0, "step": 940 }, { "epoch": 6.922509225092251, "grad_norm": 0.7312298513437542, "learning_rate": 3.0083588827067334e-06, "loss": 0.1762, "num_tokens": 103934375.0, "step": 941 }, { "epoch": 6.929889298892989, "grad_norm": 0.7258774991287039, "learning_rate": 2.999372738863627e-06, "loss": 0.1941, "num_tokens": 104057875.0, "step": 942 }, { "epoch": 6.937269372693727, "grad_norm": 0.906804533467632, "learning_rate": 2.9904009985592685e-06, "loss": 0.182, "num_tokens": 104149323.0, "step": 943 }, { "epoch": 6.944649446494465, "grad_norm": 0.7906694700640292, "learning_rate": 2.981443713470614e-06, "loss": 0.1898, "num_tokens": 104234856.0, "step": 944 }, { "epoch": 6.952029520295203, "grad_norm": 0.7908183194822428, "learning_rate": 2.972500935191361e-06, "loss": 0.2914, "num_tokens": 104400351.0, "step": 945 }, { "epoch": 6.959409594095941, "grad_norm": 0.782710003770315, "learning_rate": 2.963572715231645e-06, "loss": 0.2288, "num_tokens": 104502341.0, "step": 946 }, { "epoch": 6.966789667896679, "grad_norm": 0.6924748220617756, "learning_rate": 2.9546591050177475e-06, "loss": 0.2123, "num_tokens": 104642099.0, "step": 947 }, { "epoch": 6.974169741697417, "grad_norm": 0.7826865188420538, "learning_rate": 2.9457601558918e-06, "loss": 0.2013, "num_tokens": 104750363.0, "step": 948 }, { "epoch": 6.9815498154981555, "grad_norm": 0.880677527246301, "learning_rate": 2.936875919111485e-06, "loss": 0.1964, "num_tokens": 104834582.0, "step": 949 }, { "epoch": 6.988929889298893, "grad_norm": 0.7502521154320814, "learning_rate": 2.928006445849743e-06, "loss": 0.178, "num_tokens": 104941682.0, "step": 950 }, { "epoch": 6.996309963099631, "grad_norm": 0.7485303984947371, "learning_rate": 2.9191517871944763e-06, "loss": 0.2267, "num_tokens": 105084014.0, "step": 951 }, { "epoch": 7.0, "grad_norm": 0.7485303984947371, "learning_rate": 2.910311994148255e-06, "loss": 0.1983, "num_tokens": 105124687.0, "step": 952 }, { "epoch": 7.007380073800738, "grad_norm": 1.327378191458176, "learning_rate": 2.901487117628025e-06, "loss": 0.1776, "num_tokens": 105230003.0, "step": 953 }, { "epoch": 7.014760147601476, "grad_norm": 0.7273680072948099, "learning_rate": 2.892677208464811e-06, "loss": 0.1327, "num_tokens": 105323072.0, "step": 954 }, { "epoch": 7.022140221402214, "grad_norm": 0.5306741304213602, "learning_rate": 2.8838823174034314e-06, "loss": 0.1744, "num_tokens": 105461293.0, "step": 955 }, { "epoch": 7.029520295202952, "grad_norm": 0.5746012125314613, "learning_rate": 2.8751024951021954e-06, "loss": 0.1332, "num_tokens": 105593715.0, "step": 956 }, { "epoch": 7.03690036900369, "grad_norm": 0.7356406852351963, "learning_rate": 2.866337792132618e-06, "loss": 0.1638, "num_tokens": 105690443.0, "step": 957 }, { "epoch": 7.044280442804428, "grad_norm": 0.6708094431234726, "learning_rate": 2.85758825897913e-06, "loss": 0.1411, "num_tokens": 105804749.0, "step": 958 }, { "epoch": 7.051660516605166, "grad_norm": 0.8330900045498715, "learning_rate": 2.8488539460387822e-06, "loss": 0.1561, "num_tokens": 105918245.0, "step": 959 }, { "epoch": 7.059040590405904, "grad_norm": 0.882272731175345, "learning_rate": 2.8401349036209563e-06, "loss": 0.1712, "num_tokens": 106001812.0, "step": 960 }, { "epoch": 7.0664206642066425, "grad_norm": 1.0611329952976811, "learning_rate": 2.8314311819470786e-06, "loss": 0.1288, "num_tokens": 106096337.0, "step": 961 }, { "epoch": 7.07380073800738, "grad_norm": 0.8757551848240493, "learning_rate": 2.822742831150328e-06, "loss": 0.1456, "num_tokens": 106227709.0, "step": 962 }, { "epoch": 7.081180811808118, "grad_norm": 0.9051974031387492, "learning_rate": 2.814069901275345e-06, "loss": 0.6104, "num_tokens": 106371387.0, "step": 963 }, { "epoch": 7.088560885608856, "grad_norm": 0.747810427910208, "learning_rate": 2.8054124422779495e-06, "loss": 0.1615, "num_tokens": 106480982.0, "step": 964 }, { "epoch": 7.095940959409594, "grad_norm": 0.7570107836205541, "learning_rate": 2.7967705040248467e-06, "loss": 0.1543, "num_tokens": 106591191.0, "step": 965 }, { "epoch": 7.1033210332103325, "grad_norm": 0.7597118970878524, "learning_rate": 2.788144136293347e-06, "loss": 0.133, "num_tokens": 106683305.0, "step": 966 }, { "epoch": 7.11070110701107, "grad_norm": 0.8422954888652188, "learning_rate": 2.779533388771069e-06, "loss": 0.1766, "num_tokens": 106772494.0, "step": 967 }, { "epoch": 7.118081180811808, "grad_norm": 0.7074748344221055, "learning_rate": 2.7709383110556663e-06, "loss": 0.1295, "num_tokens": 106898534.0, "step": 968 }, { "epoch": 7.125461254612546, "grad_norm": 0.5549166268641669, "learning_rate": 2.7623589526545292e-06, "loss": 0.1397, "num_tokens": 107027631.0, "step": 969 }, { "epoch": 7.132841328413284, "grad_norm": 0.696716651502236, "learning_rate": 2.753795362984507e-06, "loss": 0.1261, "num_tokens": 107120375.0, "step": 970 }, { "epoch": 7.1402214022140225, "grad_norm": 0.629510010033181, "learning_rate": 2.745247591371623e-06, "loss": 0.1166, "num_tokens": 107222605.0, "step": 971 }, { "epoch": 7.14760147601476, "grad_norm": 0.6636518088732871, "learning_rate": 2.736715687050787e-06, "loss": 0.1419, "num_tokens": 107314539.0, "step": 972 }, { "epoch": 7.154981549815498, "grad_norm": 0.7553635445094093, "learning_rate": 2.7281996991655147e-06, "loss": 0.1519, "num_tokens": 107400637.0, "step": 973 }, { "epoch": 7.162361623616236, "grad_norm": 0.6672124661795636, "learning_rate": 2.719699676767641e-06, "loss": 0.1628, "num_tokens": 107507287.0, "step": 974 }, { "epoch": 7.169741697416974, "grad_norm": 0.7514636968559429, "learning_rate": 2.711215668817046e-06, "loss": 0.1542, "num_tokens": 107610980.0, "step": 975 }, { "epoch": 7.177121771217712, "grad_norm": 0.7856635815707779, "learning_rate": 2.7027477241813628e-06, "loss": 0.1407, "num_tokens": 107711542.0, "step": 976 }, { "epoch": 7.18450184501845, "grad_norm": 0.6777094530338031, "learning_rate": 2.6942958916356997e-06, "loss": 0.1179, "num_tokens": 107809906.0, "step": 977 }, { "epoch": 7.191881918819188, "grad_norm": 0.6778942587881792, "learning_rate": 2.685860219862362e-06, "loss": 0.1601, "num_tokens": 107940488.0, "step": 978 }, { "epoch": 7.199261992619927, "grad_norm": 0.7379615357781537, "learning_rate": 2.6774407574505677e-06, "loss": 0.1212, "num_tokens": 108041747.0, "step": 979 }, { "epoch": 7.206642066420664, "grad_norm": 0.6808474625734778, "learning_rate": 2.669037552896172e-06, "loss": 0.1769, "num_tokens": 108164572.0, "step": 980 }, { "epoch": 7.214022140221402, "grad_norm": 0.7613234200216101, "learning_rate": 2.6606506546013813e-06, "loss": 0.1268, "num_tokens": 108272656.0, "step": 981 }, { "epoch": 7.22140221402214, "grad_norm": 0.6673755565328315, "learning_rate": 2.65228011087448e-06, "loss": 0.1348, "num_tokens": 108363753.0, "step": 982 }, { "epoch": 7.228782287822878, "grad_norm": 0.7883723389050911, "learning_rate": 2.643925969929555e-06, "loss": 0.1738, "num_tokens": 108469274.0, "step": 983 }, { "epoch": 7.236162361623617, "grad_norm": 0.7164678591014224, "learning_rate": 2.635588279886207e-06, "loss": 0.12, "num_tokens": 108562632.0, "step": 984 }, { "epoch": 7.243542435424354, "grad_norm": 0.7832327287230083, "learning_rate": 2.6272670887692832e-06, "loss": 0.1707, "num_tokens": 108680167.0, "step": 985 }, { "epoch": 7.250922509225092, "grad_norm": 0.5796419139653467, "learning_rate": 2.618962444508599e-06, "loss": 0.642, "num_tokens": 108850176.0, "step": 986 }, { "epoch": 7.25830258302583, "grad_norm": 0.6134482139402445, "learning_rate": 2.6106743949386585e-06, "loss": 0.1114, "num_tokens": 108946874.0, "step": 987 }, { "epoch": 7.265682656826568, "grad_norm": 0.630596461450639, "learning_rate": 2.6024029877983804e-06, "loss": 0.1575, "num_tokens": 109090243.0, "step": 988 }, { "epoch": 7.273062730627307, "grad_norm": 0.7023239819899377, "learning_rate": 2.594148270730823e-06, "loss": 0.143, "num_tokens": 109218260.0, "step": 989 }, { "epoch": 7.280442804428044, "grad_norm": 0.7603245714545737, "learning_rate": 2.5859102912829127e-06, "loss": 0.1351, "num_tokens": 109328685.0, "step": 990 }, { "epoch": 7.287822878228782, "grad_norm": 0.6309167253692112, "learning_rate": 2.577689096905166e-06, "loss": 0.1456, "num_tokens": 109445171.0, "step": 991 }, { "epoch": 7.29520295202952, "grad_norm": 0.6396034312241526, "learning_rate": 2.5694847349514175e-06, "loss": 0.1354, "num_tokens": 109545079.0, "step": 992 }, { "epoch": 7.302583025830258, "grad_norm": 0.7331489811512829, "learning_rate": 2.56129725267855e-06, "loss": 0.1549, "num_tokens": 109679753.0, "step": 993 }, { "epoch": 7.3099630996309966, "grad_norm": 0.7132913901541111, "learning_rate": 2.5531266972462176e-06, "loss": 0.2337, "num_tokens": 109845544.0, "step": 994 }, { "epoch": 7.317343173431734, "grad_norm": 0.672421454522611, "learning_rate": 2.544973115716577e-06, "loss": 0.1603, "num_tokens": 109978598.0, "step": 995 }, { "epoch": 7.324723247232472, "grad_norm": 0.7517681603828581, "learning_rate": 2.5368365550540154e-06, "loss": 0.135, "num_tokens": 110068508.0, "step": 996 }, { "epoch": 7.332103321033211, "grad_norm": 0.6560651437258584, "learning_rate": 2.52871706212488e-06, "loss": 0.1427, "num_tokens": 110140082.0, "step": 997 }, { "epoch": 7.339483394833948, "grad_norm": 0.7577820555330438, "learning_rate": 2.5206146836972102e-06, "loss": 0.1477, "num_tokens": 110256850.0, "step": 998 }, { "epoch": 7.3468634686346865, "grad_norm": 0.6606555518773797, "learning_rate": 2.5125294664404635e-06, "loss": 0.1314, "num_tokens": 110356826.0, "step": 999 }, { "epoch": 7.354243542435424, "grad_norm": 0.7813586653528403, "learning_rate": 2.504461456925251e-06, "loss": 0.1547, "num_tokens": 110458059.0, "step": 1000 }, { "epoch": 7.361623616236162, "grad_norm": 0.7056250932214523, "learning_rate": 2.4964107016230703e-06, "loss": 0.2277, "num_tokens": 110588624.0, "step": 1001 }, { "epoch": 7.369003690036901, "grad_norm": 0.7183263827162158, "learning_rate": 2.488377246906031e-06, "loss": 0.1053, "num_tokens": 110675900.0, "step": 1002 }, { "epoch": 7.376383763837638, "grad_norm": 0.6558940456536225, "learning_rate": 2.4803611390465925e-06, "loss": 0.1339, "num_tokens": 110782752.0, "step": 1003 }, { "epoch": 7.3837638376383765, "grad_norm": 0.6936986894485098, "learning_rate": 2.4723624242173007e-06, "loss": 0.1548, "num_tokens": 110899453.0, "step": 1004 }, { "epoch": 7.391143911439114, "grad_norm": 0.7001137198073798, "learning_rate": 2.4643811484905145e-06, "loss": 0.147, "num_tokens": 111051369.0, "step": 1005 }, { "epoch": 7.398523985239852, "grad_norm": 0.7131543193897704, "learning_rate": 2.4564173578381447e-06, "loss": 0.1229, "num_tokens": 111147927.0, "step": 1006 }, { "epoch": 7.405904059040591, "grad_norm": 0.7784078969716604, "learning_rate": 2.4484710981313883e-06, "loss": 0.1441, "num_tokens": 111245874.0, "step": 1007 }, { "epoch": 7.413284132841328, "grad_norm": 0.6840839356194213, "learning_rate": 2.4405424151404664e-06, "loss": 0.4253, "num_tokens": 111385282.0, "step": 1008 }, { "epoch": 7.4206642066420665, "grad_norm": 0.8005602878365277, "learning_rate": 2.432631354534355e-06, "loss": 0.1455, "num_tokens": 111508532.0, "step": 1009 }, { "epoch": 7.428044280442805, "grad_norm": 0.7005436952515709, "learning_rate": 2.424737961880531e-06, "loss": 0.1318, "num_tokens": 111603739.0, "step": 1010 }, { "epoch": 7.435424354243542, "grad_norm": 0.656941062893856, "learning_rate": 2.4168622826447016e-06, "loss": 0.1288, "num_tokens": 111719579.0, "step": 1011 }, { "epoch": 7.442804428044281, "grad_norm": 0.7944245640607924, "learning_rate": 2.4090043621905435e-06, "loss": 0.1663, "num_tokens": 111825869.0, "step": 1012 }, { "epoch": 7.450184501845018, "grad_norm": 0.880039654114411, "learning_rate": 2.401164245779447e-06, "loss": 0.1286, "num_tokens": 111920941.0, "step": 1013 }, { "epoch": 7.4575645756457565, "grad_norm": 0.6486686170838069, "learning_rate": 2.3933419785702476e-06, "loss": 0.146, "num_tokens": 112044133.0, "step": 1014 }, { "epoch": 7.464944649446495, "grad_norm": 0.7008456334929012, "learning_rate": 2.385537605618974e-06, "loss": 0.1562, "num_tokens": 112147601.0, "step": 1015 }, { "epoch": 7.472324723247232, "grad_norm": 0.8614114433536172, "learning_rate": 2.377751171878581e-06, "loss": 0.1754, "num_tokens": 112286299.0, "step": 1016 }, { "epoch": 7.479704797047971, "grad_norm": 0.6006994620540834, "learning_rate": 2.369982722198697e-06, "loss": 0.135, "num_tokens": 112397315.0, "step": 1017 }, { "epoch": 7.487084870848708, "grad_norm": 0.6421371758364183, "learning_rate": 2.3622323013253595e-06, "loss": 0.1827, "num_tokens": 112551274.0, "step": 1018 }, { "epoch": 7.4944649446494465, "grad_norm": 0.6116215442679512, "learning_rate": 2.354499953900765e-06, "loss": 0.1276, "num_tokens": 112692453.0, "step": 1019 }, { "epoch": 7.501845018450185, "grad_norm": 0.6575972998841045, "learning_rate": 2.346785724463002e-06, "loss": 0.1272, "num_tokens": 112781826.0, "step": 1020 }, { "epoch": 7.509225092250922, "grad_norm": 0.7461270099639635, "learning_rate": 2.339089657445807e-06, "loss": 0.1481, "num_tokens": 112886040.0, "step": 1021 }, { "epoch": 7.516605166051661, "grad_norm": 0.6313586444022081, "learning_rate": 2.3314117971782947e-06, "loss": 0.1456, "num_tokens": 113017230.0, "step": 1022 }, { "epoch": 7.523985239852399, "grad_norm": 0.7039813774470369, "learning_rate": 2.3237521878847128e-06, "loss": 0.1586, "num_tokens": 113126819.0, "step": 1023 }, { "epoch": 7.531365313653136, "grad_norm": 0.7964284587475011, "learning_rate": 2.316110873684183e-06, "loss": 0.1183, "num_tokens": 113225040.0, "step": 1024 }, { "epoch": 7.538745387453875, "grad_norm": 0.7294701258254773, "learning_rate": 2.308487898590448e-06, "loss": 0.1444, "num_tokens": 113327340.0, "step": 1025 }, { "epoch": 7.546125461254612, "grad_norm": 0.877832148971536, "learning_rate": 2.3008833065116173e-06, "loss": 0.1451, "num_tokens": 113406386.0, "step": 1026 }, { "epoch": 7.553505535055351, "grad_norm": 0.8266989648256501, "learning_rate": 2.2932971412499173e-06, "loss": 0.1622, "num_tokens": 113511600.0, "step": 1027 }, { "epoch": 7.560885608856088, "grad_norm": 0.7095373348730436, "learning_rate": 2.285729446501434e-06, "loss": 0.1556, "num_tokens": 113654802.0, "step": 1028 }, { "epoch": 7.568265682656826, "grad_norm": 0.577958730374615, "learning_rate": 2.2781802658558636e-06, "loss": 0.117, "num_tokens": 113770358.0, "step": 1029 }, { "epoch": 7.575645756457565, "grad_norm": 0.8026599393917855, "learning_rate": 2.2706496427962633e-06, "loss": 0.1556, "num_tokens": 113842420.0, "step": 1030 }, { "epoch": 7.583025830258302, "grad_norm": 0.8670380842470107, "learning_rate": 2.263137620698797e-06, "loss": 0.1503, "num_tokens": 113942402.0, "step": 1031 }, { "epoch": 7.590405904059041, "grad_norm": 0.6459757734058699, "learning_rate": 2.2556442428324896e-06, "loss": 0.112, "num_tokens": 114049763.0, "step": 1032 }, { "epoch": 7.597785977859779, "grad_norm": 0.6198832800578641, "learning_rate": 2.2481695523589747e-06, "loss": 0.1508, "num_tokens": 114193985.0, "step": 1033 }, { "epoch": 7.605166051660516, "grad_norm": 0.6464652228137838, "learning_rate": 2.240713592332248e-06, "loss": 0.1414, "num_tokens": 114287318.0, "step": 1034 }, { "epoch": 7.612546125461255, "grad_norm": 0.6911239323885385, "learning_rate": 2.2332764056984156e-06, "loss": 0.3724, "num_tokens": 114383351.0, "step": 1035 }, { "epoch": 7.619926199261993, "grad_norm": 0.7110968352750003, "learning_rate": 2.2258580352954558e-06, "loss": 0.1471, "num_tokens": 114482588.0, "step": 1036 }, { "epoch": 7.627306273062731, "grad_norm": 0.6758906829132127, "learning_rate": 2.2184585238529584e-06, "loss": 0.1284, "num_tokens": 114581882.0, "step": 1037 }, { "epoch": 7.634686346863469, "grad_norm": 0.6890951539697263, "learning_rate": 2.2110779139918893e-06, "loss": 0.172, "num_tokens": 114687278.0, "step": 1038 }, { "epoch": 7.642066420664206, "grad_norm": 0.7597164898491231, "learning_rate": 2.2037162482243445e-06, "loss": 0.1208, "num_tokens": 114786568.0, "step": 1039 }, { "epoch": 7.649446494464945, "grad_norm": 0.7074984062973345, "learning_rate": 2.1963735689532993e-06, "loss": 0.1299, "num_tokens": 114870784.0, "step": 1040 }, { "epoch": 7.656826568265682, "grad_norm": 0.692609386390428, "learning_rate": 2.189049918472368e-06, "loss": 0.182, "num_tokens": 115000403.0, "step": 1041 }, { "epoch": 7.6642066420664205, "grad_norm": 0.825979291195909, "learning_rate": 2.1817453389655597e-06, "loss": 0.1265, "num_tokens": 115085653.0, "step": 1042 }, { "epoch": 7.671586715867159, "grad_norm": 0.591110702939691, "learning_rate": 2.174459872507035e-06, "loss": 0.1313, "num_tokens": 115199605.0, "step": 1043 }, { "epoch": 7.678966789667896, "grad_norm": 0.6922804180876169, "learning_rate": 2.167193561060863e-06, "loss": 0.1422, "num_tokens": 115316924.0, "step": 1044 }, { "epoch": 7.686346863468635, "grad_norm": 0.8015568869876212, "learning_rate": 2.1599464464807856e-06, "loss": 0.1639, "num_tokens": 115424183.0, "step": 1045 }, { "epoch": 7.693726937269373, "grad_norm": 0.795673213014524, "learning_rate": 2.1527185705099646e-06, "loss": 0.1465, "num_tokens": 115526922.0, "step": 1046 }, { "epoch": 7.7011070110701105, "grad_norm": 0.6167523878689091, "learning_rate": 2.145509974780752e-06, "loss": 0.1195, "num_tokens": 115618628.0, "step": 1047 }, { "epoch": 7.708487084870849, "grad_norm": 0.6940445233539998, "learning_rate": 2.1383207008144447e-06, "loss": 0.1225, "num_tokens": 115709416.0, "step": 1048 }, { "epoch": 7.715867158671586, "grad_norm": 0.8780760736913564, "learning_rate": 2.131150790021047e-06, "loss": 0.2002, "num_tokens": 115808392.0, "step": 1049 }, { "epoch": 7.723247232472325, "grad_norm": 0.6517738449681595, "learning_rate": 2.124000283699033e-06, "loss": 0.1405, "num_tokens": 115925056.0, "step": 1050 }, { "epoch": 7.730627306273063, "grad_norm": 0.7288093305351533, "learning_rate": 2.1168692230351056e-06, "loss": 0.1343, "num_tokens": 116026918.0, "step": 1051 }, { "epoch": 7.7380073800738005, "grad_norm": 0.7310571676229276, "learning_rate": 2.1097576491039616e-06, "loss": 0.159, "num_tokens": 116140965.0, "step": 1052 }, { "epoch": 7.745387453874539, "grad_norm": 0.7922586445064035, "learning_rate": 2.1026656028680577e-06, "loss": 0.187, "num_tokens": 116259358.0, "step": 1053 }, { "epoch": 7.752767527675276, "grad_norm": 0.8076731960056938, "learning_rate": 2.0955931251773694e-06, "loss": 0.1511, "num_tokens": 116366422.0, "step": 1054 }, { "epoch": 7.760147601476015, "grad_norm": 0.7714587035390015, "learning_rate": 2.088540256769157e-06, "loss": 0.1583, "num_tokens": 116469692.0, "step": 1055 }, { "epoch": 7.767527675276753, "grad_norm": 0.8724198754759499, "learning_rate": 2.0815070382677325e-06, "loss": 0.1346, "num_tokens": 116558086.0, "step": 1056 }, { "epoch": 7.7749077490774905, "grad_norm": 0.773017521357972, "learning_rate": 2.0744935101842277e-06, "loss": 0.1553, "num_tokens": 116675823.0, "step": 1057 }, { "epoch": 7.782287822878229, "grad_norm": 0.7258629619643803, "learning_rate": 2.067499712916355e-06, "loss": 0.1096, "num_tokens": 116763713.0, "step": 1058 }, { "epoch": 7.789667896678967, "grad_norm": 0.7194357884958432, "learning_rate": 2.060525686748179e-06, "loss": 0.1348, "num_tokens": 116851558.0, "step": 1059 }, { "epoch": 7.797047970479705, "grad_norm": 0.6692815808770822, "learning_rate": 2.0535714718498824e-06, "loss": 0.1496, "num_tokens": 116968472.0, "step": 1060 }, { "epoch": 7.804428044280443, "grad_norm": 0.7919788891125946, "learning_rate": 2.0466371082775362e-06, "loss": 0.1468, "num_tokens": 117069879.0, "step": 1061 }, { "epoch": 7.8118081180811805, "grad_norm": 0.7147707246771277, "learning_rate": 2.0397226359728705e-06, "loss": 0.1506, "num_tokens": 117210319.0, "step": 1062 }, { "epoch": 7.819188191881919, "grad_norm": 0.7144100080912492, "learning_rate": 2.03282809476304e-06, "loss": 0.1458, "num_tokens": 117312938.0, "step": 1063 }, { "epoch": 7.826568265682657, "grad_norm": 0.8295047610651002, "learning_rate": 2.025953524360396e-06, "loss": 0.1635, "num_tokens": 117439801.0, "step": 1064 }, { "epoch": 7.833948339483395, "grad_norm": 0.7163457526537156, "learning_rate": 2.0190989643622615e-06, "loss": 0.125, "num_tokens": 117522839.0, "step": 1065 }, { "epoch": 7.841328413284133, "grad_norm": 0.7292120907586226, "learning_rate": 2.012264454250697e-06, "loss": 0.1251, "num_tokens": 117616016.0, "step": 1066 }, { "epoch": 7.8487084870848705, "grad_norm": 0.6896242843049234, "learning_rate": 2.0054500333922783e-06, "loss": 0.1635, "num_tokens": 117723281.0, "step": 1067 }, { "epoch": 7.856088560885609, "grad_norm": 0.760110462390299, "learning_rate": 1.998655741037867e-06, "loss": 0.1686, "num_tokens": 117868155.0, "step": 1068 }, { "epoch": 7.863468634686347, "grad_norm": 0.6419790312136794, "learning_rate": 1.9918816163223847e-06, "loss": 0.1311, "num_tokens": 117969002.0, "step": 1069 }, { "epoch": 7.870848708487085, "grad_norm": 0.7127690713983089, "learning_rate": 1.985127698264589e-06, "loss": 0.1232, "num_tokens": 118080125.0, "step": 1070 }, { "epoch": 7.878228782287823, "grad_norm": 0.6059823842723294, "learning_rate": 1.9783940257668475e-06, "loss": 0.1805, "num_tokens": 118205028.0, "step": 1071 }, { "epoch": 7.885608856088561, "grad_norm": 0.7724954346608415, "learning_rate": 1.971680637614915e-06, "loss": 0.2085, "num_tokens": 118327856.0, "step": 1072 }, { "epoch": 7.892988929889299, "grad_norm": 0.6360178773299939, "learning_rate": 1.964987572477706e-06, "loss": 0.1432, "num_tokens": 118434556.0, "step": 1073 }, { "epoch": 7.900369003690037, "grad_norm": 0.8040243280272408, "learning_rate": 1.9583148689070762e-06, "loss": 0.1277, "num_tokens": 118536005.0, "step": 1074 }, { "epoch": 7.907749077490775, "grad_norm": 0.638107685289796, "learning_rate": 1.9516625653376027e-06, "loss": 0.1282, "num_tokens": 118639759.0, "step": 1075 }, { "epoch": 7.915129151291513, "grad_norm": 0.6035748385223451, "learning_rate": 1.9450307000863546e-06, "loss": 0.1669, "num_tokens": 118820768.0, "step": 1076 }, { "epoch": 7.922509225092251, "grad_norm": 0.6903668767059657, "learning_rate": 1.9384193113526793e-06, "loss": 0.1884, "num_tokens": 118936693.0, "step": 1077 }, { "epoch": 7.929889298892989, "grad_norm": 0.7353679463674523, "learning_rate": 1.9318284372179784e-06, "loss": 0.1454, "num_tokens": 119049510.0, "step": 1078 }, { "epoch": 7.937269372693727, "grad_norm": 0.8100224185456285, "learning_rate": 1.925258115645493e-06, "loss": 0.166, "num_tokens": 119151079.0, "step": 1079 }, { "epoch": 7.944649446494465, "grad_norm": 0.682480326618363, "learning_rate": 1.9187083844800795e-06, "loss": 0.161, "num_tokens": 119275485.0, "step": 1080 }, { "epoch": 7.952029520295203, "grad_norm": 0.7756070561036585, "learning_rate": 1.9121792814479947e-06, "loss": 0.1324, "num_tokens": 119384383.0, "step": 1081 }, { "epoch": 7.959409594095941, "grad_norm": 0.6820620129417011, "learning_rate": 1.9056708441566784e-06, "loss": 0.1412, "num_tokens": 119486138.0, "step": 1082 }, { "epoch": 7.966789667896679, "grad_norm": 0.6962863439355318, "learning_rate": 1.8991831100945351e-06, "loss": 0.1676, "num_tokens": 119570636.0, "step": 1083 }, { "epoch": 7.974169741697417, "grad_norm": 1.083753376552152, "learning_rate": 1.8927161166307212e-06, "loss": 0.1658, "num_tokens": 119669594.0, "step": 1084 }, { "epoch": 7.9815498154981555, "grad_norm": 0.6265816477912981, "learning_rate": 1.8862699010149269e-06, "loss": 0.1916, "num_tokens": 119771467.0, "step": 1085 }, { "epoch": 7.988929889298893, "grad_norm": 0.8302480419180455, "learning_rate": 1.8798445003771622e-06, "loss": 0.4692, "num_tokens": 119926781.0, "step": 1086 }, { "epoch": 7.996309963099631, "grad_norm": 0.6010145896400855, "learning_rate": 1.8734399517275434e-06, "loss": 0.1701, "num_tokens": 120069416.0, "step": 1087 }, { "epoch": 8.0, "grad_norm": 1.124987574983492, "learning_rate": 1.867056291956082e-06, "loss": 0.182, "num_tokens": 120150035.0, "step": 1088 }, { "epoch": 8.007380073800737, "grad_norm": 0.607427419879896, "learning_rate": 1.8606935578324687e-06, "loss": 0.1024, "num_tokens": 120248370.0, "step": 1089 }, { "epoch": 8.014760147601477, "grad_norm": 0.6535965619098806, "learning_rate": 1.8543517860058619e-06, "loss": 0.1196, "num_tokens": 120340917.0, "step": 1090 }, { "epoch": 8.022140221402214, "grad_norm": 0.6294450411059588, "learning_rate": 1.848031013004678e-06, "loss": 0.1035, "num_tokens": 120426701.0, "step": 1091 }, { "epoch": 8.029520295202952, "grad_norm": 0.5688460646507363, "learning_rate": 1.8417312752363844e-06, "loss": 0.138, "num_tokens": 120588615.0, "step": 1092 }, { "epoch": 8.03690036900369, "grad_norm": 0.5995824249286849, "learning_rate": 1.8354526089872826e-06, "loss": 0.1419, "num_tokens": 120698924.0, "step": 1093 }, { "epoch": 8.044280442804428, "grad_norm": 0.6075907318827694, "learning_rate": 1.8291950504223033e-06, "loss": 0.1039, "num_tokens": 120811933.0, "step": 1094 }, { "epoch": 8.051660516605166, "grad_norm": 0.6933032668859173, "learning_rate": 1.8229586355847978e-06, "loss": 0.1124, "num_tokens": 120922751.0, "step": 1095 }, { "epoch": 8.059040590405903, "grad_norm": 0.6970315340175753, "learning_rate": 1.816743400396329e-06, "loss": 0.0972, "num_tokens": 121034194.0, "step": 1096 }, { "epoch": 8.066420664206642, "grad_norm": 0.7617170293501341, "learning_rate": 1.81054938065647e-06, "loss": 0.1269, "num_tokens": 121140277.0, "step": 1097 }, { "epoch": 8.07380073800738, "grad_norm": 0.7173193240486126, "learning_rate": 1.804376612042589e-06, "loss": 0.0877, "num_tokens": 121213531.0, "step": 1098 }, { "epoch": 8.081180811808117, "grad_norm": 0.7090653918012082, "learning_rate": 1.7982251301096498e-06, "loss": 0.098, "num_tokens": 121336819.0, "step": 1099 }, { "epoch": 8.088560885608857, "grad_norm": 0.5605275633072052, "learning_rate": 1.7920949702900058e-06, "loss": 0.0748, "num_tokens": 121460540.0, "step": 1100 }, { "epoch": 8.095940959409594, "grad_norm": 0.9019987403479388, "learning_rate": 1.785986167893195e-06, "loss": 0.116, "num_tokens": 121546617.0, "step": 1101 }, { "epoch": 8.103321033210332, "grad_norm": 0.8064867967283718, "learning_rate": 1.7798987581057386e-06, "loss": 0.1114, "num_tokens": 121628635.0, "step": 1102 }, { "epoch": 8.11070110701107, "grad_norm": 0.5272843009986005, "learning_rate": 1.7738327759909354e-06, "loss": 0.0954, "num_tokens": 121757053.0, "step": 1103 }, { "epoch": 8.118081180811808, "grad_norm": 0.6908802325389886, "learning_rate": 1.7677882564886618e-06, "loss": 0.1085, "num_tokens": 121851076.0, "step": 1104 }, { "epoch": 8.125461254612546, "grad_norm": 0.6822375336604328, "learning_rate": 1.761765234415172e-06, "loss": 0.1166, "num_tokens": 121955294.0, "step": 1105 }, { "epoch": 8.132841328413285, "grad_norm": 0.48922879377109746, "learning_rate": 1.7557637444628935e-06, "loss": 0.0955, "num_tokens": 122086501.0, "step": 1106 }, { "epoch": 8.140221402214022, "grad_norm": 0.6797930896552248, "learning_rate": 1.74978382120023e-06, "loss": 0.1186, "num_tokens": 122163798.0, "step": 1107 }, { "epoch": 8.14760147601476, "grad_norm": 0.6264128859132683, "learning_rate": 1.743825499071362e-06, "loss": 0.1076, "num_tokens": 122277678.0, "step": 1108 }, { "epoch": 8.154981549815497, "grad_norm": 0.7398057787194479, "learning_rate": 1.7378888123960474e-06, "loss": 0.1053, "num_tokens": 122365248.0, "step": 1109 }, { "epoch": 8.162361623616237, "grad_norm": 0.6466976054306609, "learning_rate": 1.7319737953694267e-06, "loss": 0.1067, "num_tokens": 122451453.0, "step": 1110 }, { "epoch": 8.169741697416974, "grad_norm": 0.5520221171044929, "learning_rate": 1.7260804820618207e-06, "loss": 0.1034, "num_tokens": 122543362.0, "step": 1111 }, { "epoch": 8.177121771217712, "grad_norm": 0.7733568794396356, "learning_rate": 1.72020890641854e-06, "loss": 0.1072, "num_tokens": 122647562.0, "step": 1112 }, { "epoch": 8.18450184501845, "grad_norm": 0.5953487947405357, "learning_rate": 1.7143591022596846e-06, "loss": 0.1116, "num_tokens": 122771485.0, "step": 1113 }, { "epoch": 8.191881918819188, "grad_norm": 0.5452828548484447, "learning_rate": 1.708531103279954e-06, "loss": 0.0781, "num_tokens": 122880126.0, "step": 1114 }, { "epoch": 8.199261992619926, "grad_norm": 0.7244406131025011, "learning_rate": 1.7027249430484496e-06, "loss": 0.1455, "num_tokens": 123011714.0, "step": 1115 }, { "epoch": 8.206642066420665, "grad_norm": 0.6286165801135194, "learning_rate": 1.6969406550084805e-06, "loss": 0.1054, "num_tokens": 123119734.0, "step": 1116 }, { "epoch": 8.214022140221402, "grad_norm": 0.702033524190885, "learning_rate": 1.691178272477375e-06, "loss": 0.419, "num_tokens": 123240721.0, "step": 1117 }, { "epoch": 8.22140221402214, "grad_norm": 0.7033012508596181, "learning_rate": 1.6854378286462844e-06, "loss": 0.1303, "num_tokens": 123377125.0, "step": 1118 }, { "epoch": 8.228782287822877, "grad_norm": 0.6982785557123685, "learning_rate": 1.6797193565799955e-06, "loss": 0.1361, "num_tokens": 123493175.0, "step": 1119 }, { "epoch": 8.236162361623617, "grad_norm": 0.5384384152900734, "learning_rate": 1.674022889216737e-06, "loss": 0.4249, "num_tokens": 123635160.0, "step": 1120 }, { "epoch": 8.243542435424354, "grad_norm": 0.5900790975693391, "learning_rate": 1.668348459367992e-06, "loss": 0.0914, "num_tokens": 123741979.0, "step": 1121 }, { "epoch": 8.250922509225092, "grad_norm": 0.6652651245218247, "learning_rate": 1.6626960997183074e-06, "loss": 0.1206, "num_tokens": 123852328.0, "step": 1122 }, { "epoch": 8.25830258302583, "grad_norm": 0.5755410301347661, "learning_rate": 1.6570658428251075e-06, "loss": 0.1731, "num_tokens": 124007659.0, "step": 1123 }, { "epoch": 8.265682656826568, "grad_norm": 0.6436584539623239, "learning_rate": 1.6514577211185046e-06, "loss": 0.1111, "num_tokens": 124089404.0, "step": 1124 }, { "epoch": 8.273062730627306, "grad_norm": 0.6593239587931535, "learning_rate": 1.6458717669011127e-06, "loss": 0.1091, "num_tokens": 124194927.0, "step": 1125 }, { "epoch": 8.280442804428045, "grad_norm": 0.6676510191448691, "learning_rate": 1.6403080123478631e-06, "loss": 0.1269, "num_tokens": 124305690.0, "step": 1126 }, { "epoch": 8.287822878228782, "grad_norm": 0.6024940770564627, "learning_rate": 1.6347664895058151e-06, "loss": 0.1102, "num_tokens": 124423653.0, "step": 1127 }, { "epoch": 8.29520295202952, "grad_norm": 0.5704030544841321, "learning_rate": 1.6292472302939776e-06, "loss": 0.1236, "num_tokens": 124535771.0, "step": 1128 }, { "epoch": 8.302583025830259, "grad_norm": 0.6050519524784308, "learning_rate": 1.6237502665031188e-06, "loss": 0.1082, "num_tokens": 124635652.0, "step": 1129 }, { "epoch": 8.309963099630997, "grad_norm": 0.7476529098493471, "learning_rate": 1.6182756297955865e-06, "loss": 0.1363, "num_tokens": 124768210.0, "step": 1130 }, { "epoch": 8.317343173431734, "grad_norm": 0.6800745728650065, "learning_rate": 1.6128233517051267e-06, "loss": 0.098, "num_tokens": 124873080.0, "step": 1131 }, { "epoch": 8.324723247232471, "grad_norm": 0.6426860301671826, "learning_rate": 1.6073934636366983e-06, "loss": 0.1192, "num_tokens": 124997839.0, "step": 1132 }, { "epoch": 8.33210332103321, "grad_norm": 0.7184882712225652, "learning_rate": 1.6019859968662956e-06, "loss": 0.0982, "num_tokens": 125083235.0, "step": 1133 }, { "epoch": 8.339483394833948, "grad_norm": 0.5603381730436606, "learning_rate": 1.5966009825407666e-06, "loss": 0.0978, "num_tokens": 125204535.0, "step": 1134 }, { "epoch": 8.346863468634686, "grad_norm": 0.6530561101690034, "learning_rate": 1.591238451677634e-06, "loss": 0.1098, "num_tokens": 125342218.0, "step": 1135 }, { "epoch": 8.354243542435425, "grad_norm": 0.7113364303593663, "learning_rate": 1.5858984351649157e-06, "loss": 0.1487, "num_tokens": 125455802.0, "step": 1136 }, { "epoch": 8.361623616236162, "grad_norm": 0.6636500868696222, "learning_rate": 1.5805809637609482e-06, "loss": 0.1102, "num_tokens": 125601963.0, "step": 1137 }, { "epoch": 8.3690036900369, "grad_norm": 0.6106766856622697, "learning_rate": 1.5752860680942094e-06, "loss": 0.0908, "num_tokens": 125685704.0, "step": 1138 }, { "epoch": 8.376383763837639, "grad_norm": 0.6689267162919238, "learning_rate": 1.5700137786631404e-06, "loss": 0.0895, "num_tokens": 125791716.0, "step": 1139 }, { "epoch": 8.383763837638377, "grad_norm": 0.6209676350541423, "learning_rate": 1.5647641258359724e-06, "loss": 0.144, "num_tokens": 125897551.0, "step": 1140 }, { "epoch": 8.391143911439114, "grad_norm": 0.9232576930715772, "learning_rate": 1.5595371398505498e-06, "loss": 0.1248, "num_tokens": 126006191.0, "step": 1141 }, { "epoch": 8.398523985239853, "grad_norm": 0.5957913492192587, "learning_rate": 1.5543328508141565e-06, "loss": 0.1278, "num_tokens": 126140879.0, "step": 1142 }, { "epoch": 8.40590405904059, "grad_norm": 0.7026002448751827, "learning_rate": 1.5491512887033427e-06, "loss": 0.5765, "num_tokens": 126251498.0, "step": 1143 }, { "epoch": 8.413284132841328, "grad_norm": 0.6156038903064762, "learning_rate": 1.5439924833637514e-06, "loss": 0.0948, "num_tokens": 126351415.0, "step": 1144 }, { "epoch": 8.420664206642066, "grad_norm": 0.6115076221141882, "learning_rate": 1.5388564645099486e-06, "loss": 0.1236, "num_tokens": 126493084.0, "step": 1145 }, { "epoch": 8.428044280442805, "grad_norm": 0.519714845928742, "learning_rate": 1.533743261725251e-06, "loss": 0.1313, "num_tokens": 126605851.0, "step": 1146 }, { "epoch": 8.435424354243542, "grad_norm": 0.6145661780904482, "learning_rate": 1.528652904461555e-06, "loss": 0.0969, "num_tokens": 126703227.0, "step": 1147 }, { "epoch": 8.44280442804428, "grad_norm": 0.62380039781695, "learning_rate": 1.5235854220391653e-06, "loss": 0.1305, "num_tokens": 126837789.0, "step": 1148 }, { "epoch": 8.450184501845019, "grad_norm": 0.6278604652828446, "learning_rate": 1.518540843646632e-06, "loss": 0.0936, "num_tokens": 126977414.0, "step": 1149 }, { "epoch": 8.457564575645756, "grad_norm": 0.6673911003891321, "learning_rate": 1.5135191983405767e-06, "loss": 0.1149, "num_tokens": 127094861.0, "step": 1150 }, { "epoch": 8.464944649446494, "grad_norm": 0.5899818625924016, "learning_rate": 1.5085205150455266e-06, "loss": 0.1013, "num_tokens": 127222864.0, "step": 1151 }, { "epoch": 8.472324723247233, "grad_norm": 0.5779369770190897, "learning_rate": 1.5035448225537493e-06, "loss": 0.097, "num_tokens": 127336404.0, "step": 1152 }, { "epoch": 8.47970479704797, "grad_norm": 0.7033787316439764, "learning_rate": 1.4985921495250852e-06, "loss": 0.1124, "num_tokens": 127444110.0, "step": 1153 }, { "epoch": 8.487084870848708, "grad_norm": 0.7973835737567503, "learning_rate": 1.4936625244867845e-06, "loss": 0.1048, "num_tokens": 127540650.0, "step": 1154 }, { "epoch": 8.494464944649447, "grad_norm": 0.6013487090031089, "learning_rate": 1.4887559758333408e-06, "loss": 0.1008, "num_tokens": 127655205.0, "step": 1155 }, { "epoch": 8.501845018450185, "grad_norm": 0.5842927860924219, "learning_rate": 1.4838725318263273e-06, "loss": 0.1021, "num_tokens": 127747864.0, "step": 1156 }, { "epoch": 8.509225092250922, "grad_norm": 0.6318394517015664, "learning_rate": 1.4790122205942387e-06, "loss": 0.1166, "num_tokens": 127864117.0, "step": 1157 }, { "epoch": 8.51660516605166, "grad_norm": 0.6459305199339671, "learning_rate": 1.474175070132322e-06, "loss": 0.1149, "num_tokens": 127971408.0, "step": 1158 }, { "epoch": 8.523985239852399, "grad_norm": 0.5531547408810842, "learning_rate": 1.4693611083024209e-06, "loss": 0.2405, "num_tokens": 128069774.0, "step": 1159 }, { "epoch": 8.531365313653136, "grad_norm": 0.6167450947611348, "learning_rate": 1.464570362832812e-06, "loss": 0.0934, "num_tokens": 128180206.0, "step": 1160 }, { "epoch": 8.538745387453874, "grad_norm": 0.6232744033713421, "learning_rate": 1.4598028613180468e-06, "loss": 0.0963, "num_tokens": 128299598.0, "step": 1161 }, { "epoch": 8.546125461254613, "grad_norm": 0.744957544001579, "learning_rate": 1.455058631218792e-06, "loss": 0.0975, "num_tokens": 128391605.0, "step": 1162 }, { "epoch": 8.55350553505535, "grad_norm": 0.7185687853758047, "learning_rate": 1.450337699861673e-06, "loss": 0.1018, "num_tokens": 128494086.0, "step": 1163 }, { "epoch": 8.560885608856088, "grad_norm": 0.6598711587628634, "learning_rate": 1.4456400944391147e-06, "loss": 0.1067, "num_tokens": 128627405.0, "step": 1164 }, { "epoch": 8.568265682656827, "grad_norm": 0.7096790520934679, "learning_rate": 1.440965842009182e-06, "loss": 0.0955, "num_tokens": 128709021.0, "step": 1165 }, { "epoch": 8.575645756457565, "grad_norm": 0.7331549322698515, "learning_rate": 1.4363149694954335e-06, "loss": 0.4428, "num_tokens": 128879133.0, "step": 1166 }, { "epoch": 8.583025830258302, "grad_norm": 0.5965199139226404, "learning_rate": 1.4316875036867555e-06, "loss": 0.0988, "num_tokens": 129002145.0, "step": 1167 }, { "epoch": 8.59040590405904, "grad_norm": 0.6655629449526129, "learning_rate": 1.427083471237213e-06, "loss": 0.1119, "num_tokens": 129097000.0, "step": 1168 }, { "epoch": 8.597785977859779, "grad_norm": 0.7752488403562106, "learning_rate": 1.4225028986658967e-06, "loss": 0.1134, "num_tokens": 129211370.0, "step": 1169 }, { "epoch": 8.605166051660516, "grad_norm": 0.6568649340461798, "learning_rate": 1.4179458123567677e-06, "loss": 0.0988, "num_tokens": 129314954.0, "step": 1170 }, { "epoch": 8.612546125461254, "grad_norm": 0.636569631920654, "learning_rate": 1.4134122385585092e-06, "loss": 0.1287, "num_tokens": 129433148.0, "step": 1171 }, { "epoch": 8.619926199261993, "grad_norm": 0.5776985376253863, "learning_rate": 1.4089022033843704e-06, "loss": 0.1003, "num_tokens": 129527036.0, "step": 1172 }, { "epoch": 8.62730627306273, "grad_norm": 0.6437073737800211, "learning_rate": 1.4044157328120208e-06, "loss": 0.1248, "num_tokens": 129626076.0, "step": 1173 }, { "epoch": 8.634686346863468, "grad_norm": 0.6120913429381284, "learning_rate": 1.3999528526833961e-06, "loss": 0.1252, "num_tokens": 129744041.0, "step": 1174 }, { "epoch": 8.642066420664207, "grad_norm": 0.692750541310882, "learning_rate": 1.3955135887045554e-06, "loss": 0.1085, "num_tokens": 129825567.0, "step": 1175 }, { "epoch": 8.649446494464945, "grad_norm": 0.6024998881516186, "learning_rate": 1.391097966445526e-06, "loss": 0.11, "num_tokens": 129957480.0, "step": 1176 }, { "epoch": 8.656826568265682, "grad_norm": 0.718906436566963, "learning_rate": 1.3867060113401618e-06, "loss": 0.1214, "num_tokens": 130078579.0, "step": 1177 }, { "epoch": 8.664206642066421, "grad_norm": 0.5810701455364705, "learning_rate": 1.382337748685993e-06, "loss": 0.1346, "num_tokens": 130191379.0, "step": 1178 }, { "epoch": 8.671586715867159, "grad_norm": 0.7278544423341184, "learning_rate": 1.377993203644083e-06, "loss": 0.1081, "num_tokens": 130280843.0, "step": 1179 }, { "epoch": 8.678966789667896, "grad_norm": 0.6592718069090931, "learning_rate": 1.3736724012388813e-06, "loss": 0.1055, "num_tokens": 130378047.0, "step": 1180 }, { "epoch": 8.686346863468636, "grad_norm": 0.7278188201434425, "learning_rate": 1.3693753663580834e-06, "loss": 0.1002, "num_tokens": 130497156.0, "step": 1181 }, { "epoch": 8.693726937269373, "grad_norm": 0.6022490730052281, "learning_rate": 1.3651021237524808e-06, "loss": 0.0942, "num_tokens": 130594120.0, "step": 1182 }, { "epoch": 8.70110701107011, "grad_norm": 0.6312445278566866, "learning_rate": 1.3608526980358245e-06, "loss": 0.1063, "num_tokens": 130698352.0, "step": 1183 }, { "epoch": 8.708487084870848, "grad_norm": 0.5726086122274623, "learning_rate": 1.3566271136846811e-06, "loss": 0.0885, "num_tokens": 130799947.0, "step": 1184 }, { "epoch": 8.715867158671587, "grad_norm": 0.6012335783882514, "learning_rate": 1.3524253950382904e-06, "loss": 0.1255, "num_tokens": 130925327.0, "step": 1185 }, { "epoch": 8.723247232472325, "grad_norm": 0.6150543976980224, "learning_rate": 1.3482475662984273e-06, "loss": 0.131, "num_tokens": 131041478.0, "step": 1186 }, { "epoch": 8.730627306273062, "grad_norm": 0.6554155180125351, "learning_rate": 1.3440936515292608e-06, "loss": 0.1106, "num_tokens": 131138021.0, "step": 1187 }, { "epoch": 8.738007380073801, "grad_norm": 0.569887429711919, "learning_rate": 1.3399636746572167e-06, "loss": 0.1124, "num_tokens": 131249631.0, "step": 1188 }, { "epoch": 8.745387453874539, "grad_norm": 0.5518376342567334, "learning_rate": 1.335857659470839e-06, "loss": 0.0887, "num_tokens": 131352987.0, "step": 1189 }, { "epoch": 8.752767527675276, "grad_norm": 0.6139149470107075, "learning_rate": 1.331775629620653e-06, "loss": 0.1144, "num_tokens": 131471711.0, "step": 1190 }, { "epoch": 8.760147601476016, "grad_norm": 0.6318161785490038, "learning_rate": 1.3277176086190296e-06, "loss": 0.1423, "num_tokens": 131575327.0, "step": 1191 }, { "epoch": 8.767527675276753, "grad_norm": 0.6197795416833018, "learning_rate": 1.3236836198400501e-06, "loss": 0.1422, "num_tokens": 131697862.0, "step": 1192 }, { "epoch": 8.77490774907749, "grad_norm": 0.59185234264299, "learning_rate": 1.3196736865193687e-06, "loss": 0.1351, "num_tokens": 131814771.0, "step": 1193 }, { "epoch": 8.782287822878228, "grad_norm": 0.6263246337780235, "learning_rate": 1.3156878317540835e-06, "loss": 0.1207, "num_tokens": 131936657.0, "step": 1194 }, { "epoch": 8.789667896678967, "grad_norm": 0.6182091169106162, "learning_rate": 1.3117260785025987e-06, "loss": 0.1132, "num_tokens": 132059032.0, "step": 1195 }, { "epoch": 8.797047970479705, "grad_norm": 0.5904336267135564, "learning_rate": 1.3077884495844956e-06, "loss": 0.1427, "num_tokens": 132158292.0, "step": 1196 }, { "epoch": 8.804428044280442, "grad_norm": 0.6131679209745687, "learning_rate": 1.3038749676803994e-06, "loss": 0.1015, "num_tokens": 132264400.0, "step": 1197 }, { "epoch": 8.811808118081181, "grad_norm": 0.5884127696383595, "learning_rate": 1.29998565533185e-06, "loss": 0.108, "num_tokens": 132352118.0, "step": 1198 }, { "epoch": 8.819188191881919, "grad_norm": 0.5703295517477384, "learning_rate": 1.296120534941171e-06, "loss": 0.11, "num_tokens": 132459362.0, "step": 1199 }, { "epoch": 8.826568265682656, "grad_norm": 0.607194949149482, "learning_rate": 1.2922796287713413e-06, "loss": 0.0988, "num_tokens": 132568274.0, "step": 1200 }, { "epoch": 8.833948339483396, "grad_norm": 0.6442517723656137, "learning_rate": 1.2884629589458653e-06, "loss": 0.0941, "num_tokens": 132662599.0, "step": 1201 }, { "epoch": 8.841328413284133, "grad_norm": 0.615550035098192, "learning_rate": 1.284670547448649e-06, "loss": 0.1251, "num_tokens": 132822501.0, "step": 1202 }, { "epoch": 8.84870848708487, "grad_norm": 0.5943999683591958, "learning_rate": 1.2809024161238699e-06, "loss": 0.1093, "num_tokens": 132927007.0, "step": 1203 }, { "epoch": 8.85608856088561, "grad_norm": 0.5551004930503878, "learning_rate": 1.277158586675852e-06, "loss": 0.1034, "num_tokens": 133036775.0, "step": 1204 }, { "epoch": 8.863468634686347, "grad_norm": 0.6596181804090301, "learning_rate": 1.2734390806689422e-06, "loss": 0.1145, "num_tokens": 133142874.0, "step": 1205 }, { "epoch": 8.870848708487085, "grad_norm": 0.6267524284467567, "learning_rate": 1.269743919527384e-06, "loss": 0.0971, "num_tokens": 133265583.0, "step": 1206 }, { "epoch": 8.878228782287822, "grad_norm": 0.583583838659581, "learning_rate": 1.2660731245351962e-06, "loss": 0.1193, "num_tokens": 133387180.0, "step": 1207 }, { "epoch": 8.885608856088561, "grad_norm": 0.6111191321370599, "learning_rate": 1.2624267168360479e-06, "loss": 0.0918, "num_tokens": 133474575.0, "step": 1208 }, { "epoch": 8.892988929889299, "grad_norm": 0.5929732389499657, "learning_rate": 1.2588047174331417e-06, "loss": 0.1459, "num_tokens": 133631311.0, "step": 1209 }, { "epoch": 8.900369003690036, "grad_norm": 0.7561200036170181, "learning_rate": 1.2552071471890839e-06, "loss": 0.096, "num_tokens": 133716163.0, "step": 1210 }, { "epoch": 8.907749077490775, "grad_norm": 0.5415207972997961, "learning_rate": 1.2516340268257737e-06, "loss": 0.0854, "num_tokens": 133827950.0, "step": 1211 }, { "epoch": 8.915129151291513, "grad_norm": 0.7459812398595722, "learning_rate": 1.248085376924278e-06, "loss": 0.1236, "num_tokens": 133927878.0, "step": 1212 }, { "epoch": 8.92250922509225, "grad_norm": 0.634841479109334, "learning_rate": 1.2445612179247147e-06, "loss": 0.0921, "num_tokens": 134020539.0, "step": 1213 }, { "epoch": 8.92988929889299, "grad_norm": 0.6858286771716869, "learning_rate": 1.2410615701261342e-06, "loss": 0.1105, "num_tokens": 134122150.0, "step": 1214 }, { "epoch": 8.937269372693727, "grad_norm": 0.6201594087632721, "learning_rate": 1.2375864536864055e-06, "loss": 0.116, "num_tokens": 134251378.0, "step": 1215 }, { "epoch": 8.944649446494465, "grad_norm": 0.6048767594107017, "learning_rate": 1.2341358886220942e-06, "loss": 0.1395, "num_tokens": 134358760.0, "step": 1216 }, { "epoch": 8.952029520295202, "grad_norm": 0.577285117710034, "learning_rate": 1.2307098948083538e-06, "loss": 0.1262, "num_tokens": 134509061.0, "step": 1217 }, { "epoch": 8.959409594095941, "grad_norm": 0.576726954166539, "learning_rate": 1.2273084919788066e-06, "loss": 0.0936, "num_tokens": 134588340.0, "step": 1218 }, { "epoch": 8.966789667896679, "grad_norm": 0.5930291051290446, "learning_rate": 1.2239316997254328e-06, "loss": 0.0821, "num_tokens": 134689789.0, "step": 1219 }, { "epoch": 8.974169741697416, "grad_norm": 0.5702699250144813, "learning_rate": 1.220579537498454e-06, "loss": 0.0815, "num_tokens": 134783139.0, "step": 1220 }, { "epoch": 8.981549815498155, "grad_norm": 0.5278088449176532, "learning_rate": 1.2172520246062257e-06, "loss": 0.1046, "num_tokens": 134893686.0, "step": 1221 }, { "epoch": 8.988929889298893, "grad_norm": 0.6068798117027167, "learning_rate": 1.2139491802151235e-06, "loss": 0.1081, "num_tokens": 135012267.0, "step": 1222 }, { "epoch": 8.99630996309963, "grad_norm": 0.6735049740696725, "learning_rate": 1.2106710233494326e-06, "loss": 0.1339, "num_tokens": 135111751.0, "step": 1223 }, { "epoch": 9.0, "grad_norm": 0.6735049740696725, "learning_rate": 1.2074175728912397e-06, "loss": 0.1233, "num_tokens": 135169229.0, "step": 1224 }, { "epoch": 9.007380073800737, "grad_norm": 1.098013608941204, "learning_rate": 1.2041888475803217e-06, "loss": 0.1049, "num_tokens": 135282785.0, "step": 1225 }, { "epoch": 9.014760147601477, "grad_norm": 0.5532199525343341, "learning_rate": 1.200984866014041e-06, "loss": 0.0718, "num_tokens": 135399869.0, "step": 1226 }, { "epoch": 9.022140221402214, "grad_norm": 0.40362892252012594, "learning_rate": 1.1978056466472373e-06, "loss": 0.0798, "num_tokens": 135500867.0, "step": 1227 }, { "epoch": 9.029520295202952, "grad_norm": 0.43778480941123277, "learning_rate": 1.1946512077921186e-06, "loss": 0.0692, "num_tokens": 135656577.0, "step": 1228 }, { "epoch": 9.03690036900369, "grad_norm": 0.43434488006320376, "learning_rate": 1.1915215676181597e-06, "loss": 0.0769, "num_tokens": 135758420.0, "step": 1229 }, { "epoch": 9.044280442804428, "grad_norm": 0.4740122911856107, "learning_rate": 1.1884167441519944e-06, "loss": 0.0936, "num_tokens": 135852458.0, "step": 1230 }, { "epoch": 9.051660516605166, "grad_norm": 0.5561195948834813, "learning_rate": 1.1853367552773136e-06, "loss": 0.0773, "num_tokens": 135946524.0, "step": 1231 }, { "epoch": 9.059040590405903, "grad_norm": 0.6173578371571918, "learning_rate": 1.1822816187347625e-06, "loss": 0.0913, "num_tokens": 136060410.0, "step": 1232 }, { "epoch": 9.066420664206642, "grad_norm": 0.5668208716319202, "learning_rate": 1.1792513521218355e-06, "loss": 0.093, "num_tokens": 136161844.0, "step": 1233 }, { "epoch": 9.07380073800738, "grad_norm": 0.4862826369270734, "learning_rate": 1.1762459728927795e-06, "loss": 0.0906, "num_tokens": 136254874.0, "step": 1234 }, { "epoch": 9.081180811808117, "grad_norm": 0.6892251066526217, "learning_rate": 1.1732654983584896e-06, "loss": 0.0657, "num_tokens": 136349326.0, "step": 1235 }, { "epoch": 9.088560885608857, "grad_norm": 0.4569367554195257, "learning_rate": 1.1703099456864097e-06, "loss": 0.1044, "num_tokens": 136480736.0, "step": 1236 }, { "epoch": 9.095940959409594, "grad_norm": 0.7440492203877666, "learning_rate": 1.1673793319004364e-06, "loss": 0.0722, "num_tokens": 136584305.0, "step": 1237 }, { "epoch": 9.103321033210332, "grad_norm": 0.5359395869253063, "learning_rate": 1.1644736738808176e-06, "loss": 0.0859, "num_tokens": 136663948.0, "step": 1238 }, { "epoch": 9.11070110701107, "grad_norm": 0.7091484523231024, "learning_rate": 1.1615929883640569e-06, "loss": 0.2368, "num_tokens": 136814952.0, "step": 1239 }, { "epoch": 9.118081180811808, "grad_norm": 0.5697829422677767, "learning_rate": 1.1587372919428174e-06, "loss": 0.1117, "num_tokens": 136947100.0, "step": 1240 }, { "epoch": 9.125461254612546, "grad_norm": 0.48117422623071765, "learning_rate": 1.1559066010658262e-06, "loss": 0.0748, "num_tokens": 137063091.0, "step": 1241 }, { "epoch": 9.132841328413285, "grad_norm": 0.5387899819580395, "learning_rate": 1.1531009320377783e-06, "loss": 0.092, "num_tokens": 137183371.0, "step": 1242 }, { "epoch": 9.140221402214022, "grad_norm": 0.5058630416412286, "learning_rate": 1.1503203010192432e-06, "loss": 0.0731, "num_tokens": 137305263.0, "step": 1243 }, { "epoch": 9.14760147601476, "grad_norm": 0.5168951180662867, "learning_rate": 1.1475647240265746e-06, "loss": 0.0879, "num_tokens": 137410135.0, "step": 1244 }, { "epoch": 9.154981549815497, "grad_norm": 0.5688618299675912, "learning_rate": 1.144834216931813e-06, "loss": 0.134, "num_tokens": 137546953.0, "step": 1245 }, { "epoch": 9.162361623616237, "grad_norm": 0.5328692971375418, "learning_rate": 1.1421287954625988e-06, "loss": 0.0958, "num_tokens": 137634130.0, "step": 1246 }, { "epoch": 9.169741697416974, "grad_norm": 0.5054910562846227, "learning_rate": 1.1394484752020784e-06, "loss": 0.0765, "num_tokens": 137736988.0, "step": 1247 }, { "epoch": 9.177121771217712, "grad_norm": 0.4933553344922545, "learning_rate": 1.1367932715888178e-06, "loss": 0.0692, "num_tokens": 137828899.0, "step": 1248 }, { "epoch": 9.18450184501845, "grad_norm": 0.7450842938914303, "learning_rate": 1.1341631999167104e-06, "loss": 0.0865, "num_tokens": 137946709.0, "step": 1249 }, { "epoch": 9.191881918819188, "grad_norm": 0.4473386593384036, "learning_rate": 1.131558275334891e-06, "loss": 0.0823, "num_tokens": 138028535.0, "step": 1250 }, { "epoch": 9.199261992619926, "grad_norm": 0.47492406258889297, "learning_rate": 1.1289785128476476e-06, "loss": 0.0942, "num_tokens": 138134748.0, "step": 1251 }, { "epoch": 9.206642066420665, "grad_norm": 0.4362471317660537, "learning_rate": 1.1264239273143356e-06, "loss": 0.0869, "num_tokens": 138265869.0, "step": 1252 }, { "epoch": 9.214022140221402, "grad_norm": 0.5772043869677019, "learning_rate": 1.1238945334492929e-06, "loss": 0.0778, "num_tokens": 138352099.0, "step": 1253 }, { "epoch": 9.22140221402214, "grad_norm": 0.4873687523841293, "learning_rate": 1.1213903458217511e-06, "loss": 0.0838, "num_tokens": 138480445.0, "step": 1254 }, { "epoch": 9.228782287822877, "grad_norm": 0.5725491626503895, "learning_rate": 1.1189113788557584e-06, "loss": 0.0685, "num_tokens": 138564835.0, "step": 1255 }, { "epoch": 9.236162361623617, "grad_norm": 0.5378442811206418, "learning_rate": 1.1164576468300897e-06, "loss": 0.0864, "num_tokens": 138679521.0, "step": 1256 }, { "epoch": 9.243542435424354, "grad_norm": 0.4831890277582963, "learning_rate": 1.114029163878169e-06, "loss": 0.0646, "num_tokens": 138766817.0, "step": 1257 }, { "epoch": 9.250922509225092, "grad_norm": 0.5119863473235919, "learning_rate": 1.1116259439879859e-06, "loss": 0.0905, "num_tokens": 138869322.0, "step": 1258 }, { "epoch": 9.25830258302583, "grad_norm": 0.6275481878257461, "learning_rate": 1.1092480010020154e-06, "loss": 0.07, "num_tokens": 138941411.0, "step": 1259 }, { "epoch": 9.265682656826568, "grad_norm": 0.621644017095464, "learning_rate": 1.1068953486171387e-06, "loss": 0.1209, "num_tokens": 139073677.0, "step": 1260 }, { "epoch": 9.273062730627306, "grad_norm": 0.6410095734349077, "learning_rate": 1.1045680003845635e-06, "loss": 0.5559, "num_tokens": 139213583.0, "step": 1261 }, { "epoch": 9.280442804428045, "grad_norm": 0.4791056304666212, "learning_rate": 1.1022659697097466e-06, "loss": 0.1183, "num_tokens": 139315732.0, "step": 1262 }, { "epoch": 9.287822878228782, "grad_norm": 0.6139676068282646, "learning_rate": 1.099989269852317e-06, "loss": 0.0826, "num_tokens": 139406775.0, "step": 1263 }, { "epoch": 9.29520295202952, "grad_norm": 0.5727542877358448, "learning_rate": 1.0977379139259968e-06, "loss": 0.0715, "num_tokens": 139490445.0, "step": 1264 }, { "epoch": 9.302583025830259, "grad_norm": 0.4255032842147736, "learning_rate": 1.0955119148985302e-06, "loss": 0.0752, "num_tokens": 139618468.0, "step": 1265 }, { "epoch": 9.309963099630997, "grad_norm": 0.5139540382949935, "learning_rate": 1.0933112855916057e-06, "loss": 0.0916, "num_tokens": 139694374.0, "step": 1266 }, { "epoch": 9.317343173431734, "grad_norm": 0.7147664067984917, "learning_rate": 1.0911360386807814e-06, "loss": 0.1061, "num_tokens": 139827843.0, "step": 1267 }, { "epoch": 9.324723247232471, "grad_norm": 0.6217915110979024, "learning_rate": 1.0889861866954165e-06, "loss": 0.0628, "num_tokens": 139932395.0, "step": 1268 }, { "epoch": 9.33210332103321, "grad_norm": 0.47302488760885186, "learning_rate": 1.0868617420185935e-06, "loss": 0.0694, "num_tokens": 140044988.0, "step": 1269 }, { "epoch": 9.339483394833948, "grad_norm": 0.4898432241818497, "learning_rate": 1.084762716887051e-06, "loss": 0.0637, "num_tokens": 140143951.0, "step": 1270 }, { "epoch": 9.346863468634686, "grad_norm": 0.45565681545242087, "learning_rate": 1.0826891233911122e-06, "loss": 0.1, "num_tokens": 140289486.0, "step": 1271 }, { "epoch": 9.354243542435425, "grad_norm": 0.49835460213670446, "learning_rate": 1.0806409734746128e-06, "loss": 0.1157, "num_tokens": 140452034.0, "step": 1272 }, { "epoch": 9.361623616236162, "grad_norm": 0.5586254064308035, "learning_rate": 1.0786182789348357e-06, "loss": 0.0772, "num_tokens": 140559290.0, "step": 1273 }, { "epoch": 9.3690036900369, "grad_norm": 0.36499301388851546, "learning_rate": 1.076621051422442e-06, "loss": 0.0707, "num_tokens": 140648021.0, "step": 1274 }, { "epoch": 9.376383763837639, "grad_norm": 0.6315911456891137, "learning_rate": 1.0746493024414028e-06, "loss": 0.0701, "num_tokens": 140760770.0, "step": 1275 }, { "epoch": 9.383763837638377, "grad_norm": 0.4439501077654831, "learning_rate": 1.0727030433489331e-06, "loss": 0.0839, "num_tokens": 140837990.0, "step": 1276 }, { "epoch": 9.391143911439114, "grad_norm": 0.6294163873714306, "learning_rate": 1.0707822853554275e-06, "loss": 0.067, "num_tokens": 140916144.0, "step": 1277 }, { "epoch": 9.398523985239853, "grad_norm": 0.47592531161352836, "learning_rate": 1.068887039524395e-06, "loss": 0.0898, "num_tokens": 141012735.0, "step": 1278 }, { "epoch": 9.40590405904059, "grad_norm": 0.5731828438615216, "learning_rate": 1.067017316772396e-06, "loss": 0.0842, "num_tokens": 141145409.0, "step": 1279 }, { "epoch": 9.413284132841328, "grad_norm": 0.5369622337944866, "learning_rate": 1.0651731278689773e-06, "loss": 0.0999, "num_tokens": 141293141.0, "step": 1280 }, { "epoch": 9.420664206642066, "grad_norm": 0.5210810547850394, "learning_rate": 1.0633544834366125e-06, "loss": 0.0905, "num_tokens": 141424378.0, "step": 1281 }, { "epoch": 9.428044280442805, "grad_norm": 0.5441370628754586, "learning_rate": 1.0615613939506392e-06, "loss": 0.0573, "num_tokens": 141534164.0, "step": 1282 }, { "epoch": 9.435424354243542, "grad_norm": 0.4882480772188857, "learning_rate": 1.0597938697392002e-06, "loss": 0.0863, "num_tokens": 141658150.0, "step": 1283 }, { "epoch": 9.44280442804428, "grad_norm": 0.5744920846110795, "learning_rate": 1.0580519209831818e-06, "loss": 0.067, "num_tokens": 141766293.0, "step": 1284 }, { "epoch": 9.450184501845019, "grad_norm": 0.4094733363503645, "learning_rate": 1.0563355577161578e-06, "loss": 0.0754, "num_tokens": 141860600.0, "step": 1285 }, { "epoch": 9.457564575645756, "grad_norm": 0.5493452770826364, "learning_rate": 1.0546447898243282e-06, "loss": 0.0796, "num_tokens": 141953505.0, "step": 1286 }, { "epoch": 9.464944649446494, "grad_norm": 0.5578214877769143, "learning_rate": 1.0529796270464674e-06, "loss": 0.0548, "num_tokens": 142055360.0, "step": 1287 }, { "epoch": 9.472324723247233, "grad_norm": 0.49089515513264637, "learning_rate": 1.0513400789738631e-06, "loss": 0.0899, "num_tokens": 142146667.0, "step": 1288 }, { "epoch": 9.47970479704797, "grad_norm": 0.6918407979557708, "learning_rate": 1.0497261550502631e-06, "loss": 0.0858, "num_tokens": 142230862.0, "step": 1289 }, { "epoch": 9.487084870848708, "grad_norm": 0.49566833014774225, "learning_rate": 1.0481378645718215e-06, "loss": 0.0925, "num_tokens": 142362429.0, "step": 1290 }, { "epoch": 9.494464944649447, "grad_norm": 0.485006343857991, "learning_rate": 1.0465752166870445e-06, "loss": 0.073, "num_tokens": 142454056.0, "step": 1291 }, { "epoch": 9.501845018450185, "grad_norm": 0.5801596165920954, "learning_rate": 1.0450382203967372e-06, "loss": 0.0774, "num_tokens": 142565299.0, "step": 1292 }, { "epoch": 9.509225092250922, "grad_norm": 0.529155775721648, "learning_rate": 1.043526884553953e-06, "loss": 0.0638, "num_tokens": 142659189.0, "step": 1293 }, { "epoch": 9.51660516605166, "grad_norm": 0.4651717505933552, "learning_rate": 1.0420412178639408e-06, "loss": 0.0918, "num_tokens": 142785942.0, "step": 1294 }, { "epoch": 9.523985239852399, "grad_norm": 0.5415557009247054, "learning_rate": 1.0405812288840967e-06, "loss": 0.1002, "num_tokens": 142905415.0, "step": 1295 }, { "epoch": 9.531365313653136, "grad_norm": 0.5857032868286894, "learning_rate": 1.0391469260239146e-06, "loss": 0.1, "num_tokens": 143046818.0, "step": 1296 }, { "epoch": 9.538745387453874, "grad_norm": 0.5447518796281599, "learning_rate": 1.037738317544936e-06, "loss": 0.0913, "num_tokens": 143154055.0, "step": 1297 }, { "epoch": 9.546125461254613, "grad_norm": 0.5125117807248399, "learning_rate": 1.036355411560703e-06, "loss": 0.0651, "num_tokens": 143263376.0, "step": 1298 }, { "epoch": 9.55350553505535, "grad_norm": 0.48226465374189814, "learning_rate": 1.0349982160367146e-06, "loss": 0.0635, "num_tokens": 143363961.0, "step": 1299 }, { "epoch": 9.560885608856088, "grad_norm": 0.4642815323636672, "learning_rate": 1.0336667387903755e-06, "loss": 0.072, "num_tokens": 143479639.0, "step": 1300 }, { "epoch": 9.568265682656827, "grad_norm": 0.6511448364627623, "learning_rate": 1.0323609874909552e-06, "loss": 0.0861, "num_tokens": 143564742.0, "step": 1301 }, { "epoch": 9.575645756457565, "grad_norm": 0.49420521238167936, "learning_rate": 1.0310809696595431e-06, "loss": 0.081, "num_tokens": 143660847.0, "step": 1302 }, { "epoch": 9.583025830258302, "grad_norm": 0.6949716240687748, "learning_rate": 1.029826692669003e-06, "loss": 0.0969, "num_tokens": 143767931.0, "step": 1303 }, { "epoch": 9.59040590405904, "grad_norm": 0.6194488779322029, "learning_rate": 1.028598163743934e-06, "loss": 0.1239, "num_tokens": 143909274.0, "step": 1304 }, { "epoch": 9.597785977859779, "grad_norm": 0.44564646128264457, "learning_rate": 1.0273953899606256e-06, "loss": 0.0949, "num_tokens": 144015506.0, "step": 1305 }, { "epoch": 9.605166051660516, "grad_norm": 0.5191765591583786, "learning_rate": 1.0262183782470191e-06, "loss": 0.1157, "num_tokens": 144142602.0, "step": 1306 }, { "epoch": 9.612546125461254, "grad_norm": 0.6760532101276266, "learning_rate": 1.025067135382667e-06, "loss": 0.136, "num_tokens": 144257871.0, "step": 1307 }, { "epoch": 9.619926199261993, "grad_norm": 0.5689254661964385, "learning_rate": 1.0239416679986947e-06, "loss": 0.0731, "num_tokens": 144385398.0, "step": 1308 }, { "epoch": 9.62730627306273, "grad_norm": 0.525375097114315, "learning_rate": 1.0228419825777603e-06, "loss": 0.0893, "num_tokens": 144473651.0, "step": 1309 }, { "epoch": 9.634686346863468, "grad_norm": 0.546320906132776, "learning_rate": 1.021768085454019e-06, "loss": 0.0915, "num_tokens": 144586199.0, "step": 1310 }, { "epoch": 9.642066420664207, "grad_norm": 0.7454288817246733, "learning_rate": 1.0207199828130867e-06, "loss": 0.0808, "num_tokens": 144672354.0, "step": 1311 }, { "epoch": 9.649446494464945, "grad_norm": 0.5691877153607596, "learning_rate": 1.0196976806920026e-06, "loss": 0.1098, "num_tokens": 144799218.0, "step": 1312 }, { "epoch": 9.656826568265682, "grad_norm": 0.5613847781133986, "learning_rate": 1.018701184979198e-06, "loss": 0.1433, "num_tokens": 144932569.0, "step": 1313 }, { "epoch": 9.664206642066421, "grad_norm": 0.7075961001446242, "learning_rate": 1.0177305014144579e-06, "loss": 0.1632, "num_tokens": 145094329.0, "step": 1314 }, { "epoch": 9.671586715867159, "grad_norm": 0.4949014137778886, "learning_rate": 1.0167856355888906e-06, "loss": 0.0847, "num_tokens": 145200137.0, "step": 1315 }, { "epoch": 9.678966789667896, "grad_norm": 0.5777610491503402, "learning_rate": 1.0158665929448951e-06, "loss": 0.0783, "num_tokens": 145299005.0, "step": 1316 }, { "epoch": 9.686346863468636, "grad_norm": 0.5666895663706422, "learning_rate": 1.0149733787761306e-06, "loss": 0.1045, "num_tokens": 145417438.0, "step": 1317 }, { "epoch": 9.693726937269373, "grad_norm": 0.4976340719962353, "learning_rate": 1.0141059982274833e-06, "loss": 0.0977, "num_tokens": 145527495.0, "step": 1318 }, { "epoch": 9.70110701107011, "grad_norm": 0.5527340720451464, "learning_rate": 1.0132644562950395e-06, "loss": 0.0913, "num_tokens": 145660524.0, "step": 1319 }, { "epoch": 9.708487084870848, "grad_norm": 0.5971966074249265, "learning_rate": 1.0124487578260562e-06, "loss": 0.1005, "num_tokens": 145804173.0, "step": 1320 }, { "epoch": 9.715867158671587, "grad_norm": 0.5695738798252161, "learning_rate": 1.011658907518932e-06, "loss": 0.0968, "num_tokens": 145912414.0, "step": 1321 }, { "epoch": 9.723247232472325, "grad_norm": 0.5672640649845705, "learning_rate": 1.010894909923181e-06, "loss": 0.073, "num_tokens": 146020616.0, "step": 1322 }, { "epoch": 9.730627306273062, "grad_norm": 0.5872464338893862, "learning_rate": 1.0101567694394073e-06, "loss": 0.1033, "num_tokens": 146111381.0, "step": 1323 }, { "epoch": 9.738007380073801, "grad_norm": 0.6153149906400658, "learning_rate": 1.0094444903192775e-06, "loss": 0.0896, "num_tokens": 146231059.0, "step": 1324 }, { "epoch": 9.745387453874539, "grad_norm": 0.511058399196522, "learning_rate": 1.0087580766654983e-06, "loss": 0.0763, "num_tokens": 146328963.0, "step": 1325 }, { "epoch": 9.752767527675276, "grad_norm": 0.49146225377027963, "learning_rate": 1.0080975324317925e-06, "loss": 0.082, "num_tokens": 146416240.0, "step": 1326 }, { "epoch": 9.760147601476016, "grad_norm": 0.7063177174505592, "learning_rate": 1.0074628614228752e-06, "loss": 0.1369, "num_tokens": 146571417.0, "step": 1327 }, { "epoch": 9.767527675276753, "grad_norm": 0.5173878371456873, "learning_rate": 1.0068540672944318e-06, "loss": 0.4172, "num_tokens": 146710002.0, "step": 1328 }, { "epoch": 9.77490774907749, "grad_norm": 0.5063782782555744, "learning_rate": 1.0062711535530988e-06, "loss": 0.0717, "num_tokens": 146805464.0, "step": 1329 }, { "epoch": 9.782287822878228, "grad_norm": 0.5229716807346101, "learning_rate": 1.0057141235564425e-06, "loss": 0.0787, "num_tokens": 146938300.0, "step": 1330 }, { "epoch": 9.789667896678967, "grad_norm": 0.4765636960934646, "learning_rate": 1.005182980512938e-06, "loss": 0.0759, "num_tokens": 147084703.0, "step": 1331 }, { "epoch": 9.797047970479705, "grad_norm": 0.6123724015309612, "learning_rate": 1.0046777274819546e-06, "loss": 0.0992, "num_tokens": 147198672.0, "step": 1332 }, { "epoch": 9.804428044280442, "grad_norm": 0.5011853630785137, "learning_rate": 1.0041983673737344e-06, "loss": 0.0793, "num_tokens": 147296656.0, "step": 1333 }, { "epoch": 9.811808118081181, "grad_norm": 0.5179966656872552, "learning_rate": 1.0037449029493772e-06, "loss": 0.0712, "num_tokens": 147385700.0, "step": 1334 }, { "epoch": 9.819188191881919, "grad_norm": 0.5532694050161349, "learning_rate": 1.0033173368208247e-06, "loss": 0.0688, "num_tokens": 147492247.0, "step": 1335 }, { "epoch": 9.826568265682656, "grad_norm": 0.509334276535671, "learning_rate": 1.0029156714508453e-06, "loss": 0.0716, "num_tokens": 147604983.0, "step": 1336 }, { "epoch": 9.833948339483396, "grad_norm": 0.6889262547462951, "learning_rate": 1.0025399091530194e-06, "loss": 0.1197, "num_tokens": 147731969.0, "step": 1337 }, { "epoch": 9.841328413284133, "grad_norm": 0.481321432509424, "learning_rate": 1.0021900520917265e-06, "loss": 0.0795, "num_tokens": 147846280.0, "step": 1338 }, { "epoch": 9.84870848708487, "grad_norm": 0.4948805761127963, "learning_rate": 1.001866102282133e-06, "loss": 0.1026, "num_tokens": 148000531.0, "step": 1339 }, { "epoch": 9.85608856088561, "grad_norm": 0.5472075583755517, "learning_rate": 1.0015680615901803e-06, "loss": 0.0928, "num_tokens": 148099330.0, "step": 1340 }, { "epoch": 9.863468634686347, "grad_norm": 0.5123734981559245, "learning_rate": 1.0012959317325742e-06, "loss": 0.0863, "num_tokens": 148216192.0, "step": 1341 }, { "epoch": 9.870848708487085, "grad_norm": 0.41053993635286284, "learning_rate": 1.0010497142767739e-06, "loss": 0.095, "num_tokens": 148329601.0, "step": 1342 }, { "epoch": 9.878228782287822, "grad_norm": 0.5933041621898189, "learning_rate": 1.0008294106409856e-06, "loss": 0.628, "num_tokens": 148460623.0, "step": 1343 }, { "epoch": 9.885608856088561, "grad_norm": 0.5494886437683918, "learning_rate": 1.0006350220941502e-06, "loss": 0.0967, "num_tokens": 148563652.0, "step": 1344 }, { "epoch": 9.892988929889299, "grad_norm": 0.5346463232904093, "learning_rate": 1.0004665497559418e-06, "loss": 0.095, "num_tokens": 148656680.0, "step": 1345 }, { "epoch": 9.900369003690036, "grad_norm": 0.5253737724244492, "learning_rate": 1.0003239945967546e-06, "loss": 0.0884, "num_tokens": 148754060.0, "step": 1346 }, { "epoch": 9.907749077490775, "grad_norm": 0.6504020017793467, "learning_rate": 1.0002073574377025e-06, "loss": 0.0869, "num_tokens": 148830389.0, "step": 1347 }, { "epoch": 9.915129151291513, "grad_norm": 0.5903039340675148, "learning_rate": 1.0001166389506125e-06, "loss": 0.0853, "num_tokens": 148922650.0, "step": 1348 }, { "epoch": 9.92250922509225, "grad_norm": 0.625114328906451, "learning_rate": 1.0000518396580204e-06, "loss": 0.0815, "num_tokens": 149031999.0, "step": 1349 }, { "epoch": 9.92988929889299, "grad_norm": 0.5694763430053924, "learning_rate": 1.0000129599331674e-06, "loss": 0.0649, "num_tokens": 149130555.0, "step": 1350 }, { "epoch": 9.92988929889299, "step": 1350, "total_flos": 8.903987464733983e+18, "train_loss": 0.4533373955609622, "train_runtime": 14911.2824, "train_samples_per_second": 11.626, "train_steps_per_second": 0.091 } ], "logging_steps": 1, "max_steps": 1350, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.903987464733983e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }