diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10843 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.92988929889299, + "eval_steps": 500, + "global_step": 1350, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007380073800738007, + "grad_norm": 6.363606224312279, + "learning_rate": 0.0, + "loss": 1.4914, + "num_tokens": 108129.0, + "step": 1 + }, + { + "epoch": 0.014760147601476014, + "grad_norm": 6.012702836892834, + "learning_rate": 2.439024390243903e-07, + "loss": 1.4111, + "num_tokens": 262871.0, + "step": 2 + }, + { + "epoch": 0.02214022140221402, + "grad_norm": 6.615048103988674, + "learning_rate": 4.878048780487805e-07, + "loss": 1.525, + "num_tokens": 383320.0, + "step": 3 + }, + { + "epoch": 0.02952029520295203, + "grad_norm": 6.205874266530899, + "learning_rate": 7.317073170731707e-07, + "loss": 1.5202, + "num_tokens": 486579.0, + "step": 4 + }, + { + "epoch": 0.03690036900369004, + "grad_norm": 6.790213435426661, + "learning_rate": 9.75609756097561e-07, + "loss": 1.5711, + "num_tokens": 576134.0, + "step": 5 + }, + { + "epoch": 0.04428044280442804, + "grad_norm": 6.316668195961204, + "learning_rate": 1.2195121951219514e-06, + "loss": 1.4689, + "num_tokens": 686509.0, + "step": 6 + }, + { + "epoch": 0.05166051660516605, + "grad_norm": 5.938905378037534, + "learning_rate": 1.4634146341463414e-06, + "loss": 1.4945, + "num_tokens": 784231.0, + "step": 7 + }, + { + "epoch": 0.05904059040590406, + "grad_norm": 4.429329163711381, + "learning_rate": 1.707317073170732e-06, + "loss": 1.5244, + "num_tokens": 882871.0, + "step": 8 + }, + { + "epoch": 0.06642066420664207, + "grad_norm": 4.195021375348304, + "learning_rate": 1.951219512195122e-06, + "loss": 1.4425, + "num_tokens": 1005895.0, + "step": 9 + }, + { + "epoch": 0.07380073800738007, + "grad_norm": 2.885978260862124, + "learning_rate": 2.1951219512195125e-06, + "loss": 1.4326, + "num_tokens": 1105351.0, + "step": 10 + }, + { + "epoch": 0.08118081180811808, + "grad_norm": 2.573831231053168, + "learning_rate": 2.4390243902439027e-06, + "loss": 1.3451, + "num_tokens": 1226716.0, + "step": 11 + }, + { + "epoch": 0.08856088560885608, + "grad_norm": 2.146494562191251, + "learning_rate": 2.682926829268293e-06, + "loss": 1.3703, + "num_tokens": 1353599.0, + "step": 12 + }, + { + "epoch": 0.0959409594095941, + "grad_norm": 1.5117677326527352, + "learning_rate": 2.926829268292683e-06, + "loss": 1.171, + "num_tokens": 1461850.0, + "step": 13 + }, + { + "epoch": 0.1033210332103321, + "grad_norm": 2.5271220924968527, + "learning_rate": 3.1707317073170736e-06, + "loss": 1.2756, + "num_tokens": 1558756.0, + "step": 14 + }, + { + "epoch": 0.11070110701107011, + "grad_norm": 1.838325022665236, + "learning_rate": 3.414634146341464e-06, + "loss": 1.1876, + "num_tokens": 1663867.0, + "step": 15 + }, + { + "epoch": 0.11808118081180811, + "grad_norm": 1.8568958719583475, + "learning_rate": 3.6585365853658537e-06, + "loss": 1.2399, + "num_tokens": 1766057.0, + "step": 16 + }, + { + "epoch": 0.12546125461254612, + "grad_norm": 1.3695578003158202, + "learning_rate": 3.902439024390244e-06, + "loss": 1.23, + "num_tokens": 1869252.0, + "step": 17 + }, + { + "epoch": 0.13284132841328414, + "grad_norm": 1.153225570030305, + "learning_rate": 4.146341463414634e-06, + "loss": 1.1294, + "num_tokens": 1957924.0, + "step": 18 + }, + { + "epoch": 0.14022140221402213, + "grad_norm": 0.7456771122232927, + "learning_rate": 4.390243902439025e-06, + "loss": 0.9469, + "num_tokens": 2097152.0, + "step": 19 + }, + { + "epoch": 0.14760147601476015, + "grad_norm": 1.1465877970965435, + "learning_rate": 4.634146341463416e-06, + "loss": 1.1601, + "num_tokens": 2226077.0, + "step": 20 + }, + { + "epoch": 0.15498154981549817, + "grad_norm": 0.7815788812110701, + "learning_rate": 4.8780487804878055e-06, + "loss": 1.0533, + "num_tokens": 2363480.0, + "step": 21 + }, + { + "epoch": 0.16236162361623616, + "grad_norm": 0.7732486130250715, + "learning_rate": 5.121951219512195e-06, + "loss": 1.5139, + "num_tokens": 2543914.0, + "step": 22 + }, + { + "epoch": 0.16974169741697417, + "grad_norm": 0.7060278478886041, + "learning_rate": 5.365853658536586e-06, + "loss": 1.0774, + "num_tokens": 2677305.0, + "step": 23 + }, + { + "epoch": 0.17712177121771217, + "grad_norm": 0.8930074024117203, + "learning_rate": 5.609756097560977e-06, + "loss": 1.1666, + "num_tokens": 2774583.0, + "step": 24 + }, + { + "epoch": 0.18450184501845018, + "grad_norm": 0.7498436046258535, + "learning_rate": 5.853658536585366e-06, + "loss": 1.1053, + "num_tokens": 2883688.0, + "step": 25 + }, + { + "epoch": 0.1918819188191882, + "grad_norm": 0.8912766185518683, + "learning_rate": 6.0975609756097564e-06, + "loss": 1.13, + "num_tokens": 2995769.0, + "step": 26 + }, + { + "epoch": 0.1992619926199262, + "grad_norm": 0.9011384648250562, + "learning_rate": 6.341463414634147e-06, + "loss": 1.0441, + "num_tokens": 3082040.0, + "step": 27 + }, + { + "epoch": 0.2066420664206642, + "grad_norm": 0.8737960258241075, + "learning_rate": 6.585365853658538e-06, + "loss": 1.1585, + "num_tokens": 3175114.0, + "step": 28 + }, + { + "epoch": 0.2140221402214022, + "grad_norm": 0.7296535111218418, + "learning_rate": 6.829268292682928e-06, + "loss": 1.0948, + "num_tokens": 3285127.0, + "step": 29 + }, + { + "epoch": 0.22140221402214022, + "grad_norm": 0.7274693928818707, + "learning_rate": 7.0731707317073175e-06, + "loss": 1.0835, + "num_tokens": 3380009.0, + "step": 30 + }, + { + "epoch": 0.22878228782287824, + "grad_norm": 0.6088521526135358, + "learning_rate": 7.317073170731707e-06, + "loss": 1.0209, + "num_tokens": 3535375.0, + "step": 31 + }, + { + "epoch": 0.23616236162361623, + "grad_norm": 0.7018449321520274, + "learning_rate": 7.560975609756098e-06, + "loss": 1.046, + "num_tokens": 3634648.0, + "step": 32 + }, + { + "epoch": 0.24354243542435425, + "grad_norm": 0.6536374431603196, + "learning_rate": 7.804878048780489e-06, + "loss": 1.1203, + "num_tokens": 3739441.0, + "step": 33 + }, + { + "epoch": 0.25092250922509224, + "grad_norm": 0.51602945023754, + "learning_rate": 8.048780487804879e-06, + "loss": 1.1216, + "num_tokens": 3901324.0, + "step": 34 + }, + { + "epoch": 0.25830258302583026, + "grad_norm": 0.6465805158894952, + "learning_rate": 8.292682926829268e-06, + "loss": 1.091, + "num_tokens": 4028500.0, + "step": 35 + }, + { + "epoch": 0.2656826568265683, + "grad_norm": 0.6651254191562199, + "learning_rate": 8.536585365853658e-06, + "loss": 1.0302, + "num_tokens": 4122817.0, + "step": 36 + }, + { + "epoch": 0.2730627306273063, + "grad_norm": 0.6521370198607039, + "learning_rate": 8.78048780487805e-06, + "loss": 1.04, + "num_tokens": 4224938.0, + "step": 37 + }, + { + "epoch": 0.28044280442804426, + "grad_norm": 0.6007174831688549, + "learning_rate": 9.02439024390244e-06, + "loss": 1.0265, + "num_tokens": 4333124.0, + "step": 38 + }, + { + "epoch": 0.2878228782287823, + "grad_norm": 0.551059225568472, + "learning_rate": 9.268292682926831e-06, + "loss": 1.0287, + "num_tokens": 4456232.0, + "step": 39 + }, + { + "epoch": 0.2952029520295203, + "grad_norm": 0.6437675813439927, + "learning_rate": 9.51219512195122e-06, + "loss": 1.1779, + "num_tokens": 4542519.0, + "step": 40 + }, + { + "epoch": 0.3025830258302583, + "grad_norm": 0.480473097605294, + "learning_rate": 9.756097560975611e-06, + "loss": 0.8197, + "num_tokens": 4687640.0, + "step": 41 + }, + { + "epoch": 0.30996309963099633, + "grad_norm": 0.5120008356671397, + "learning_rate": 1e-05, + "loss": 0.9641, + "num_tokens": 4789703.0, + "step": 42 + }, + { + "epoch": 0.3173431734317343, + "grad_norm": 0.4568876499652346, + "learning_rate": 9.999987040066834e-06, + "loss": 1.0052, + "num_tokens": 4915970.0, + "step": 43 + }, + { + "epoch": 0.3247232472324723, + "grad_norm": 0.5913746713257689, + "learning_rate": 9.99994816034198e-06, + "loss": 1.059, + "num_tokens": 5017958.0, + "step": 44 + }, + { + "epoch": 0.33210332103321033, + "grad_norm": 0.46500058691449936, + "learning_rate": 9.999883361049389e-06, + "loss": 1.012, + "num_tokens": 5122962.0, + "step": 45 + }, + { + "epoch": 0.33948339483394835, + "grad_norm": 0.5570928695789232, + "learning_rate": 9.999792642562297e-06, + "loss": 0.9646, + "num_tokens": 5218400.0, + "step": 46 + }, + { + "epoch": 0.34686346863468637, + "grad_norm": 0.5153854617107293, + "learning_rate": 9.999676005403246e-06, + "loss": 1.0834, + "num_tokens": 5321991.0, + "step": 47 + }, + { + "epoch": 0.35424354243542433, + "grad_norm": 0.49149562713241135, + "learning_rate": 9.99953345024406e-06, + "loss": 0.9621, + "num_tokens": 5422853.0, + "step": 48 + }, + { + "epoch": 0.36162361623616235, + "grad_norm": 0.5615732057290855, + "learning_rate": 9.99936497790585e-06, + "loss": 1.0506, + "num_tokens": 5515879.0, + "step": 49 + }, + { + "epoch": 0.36900369003690037, + "grad_norm": 0.571733530601376, + "learning_rate": 9.999170589359015e-06, + "loss": 1.0696, + "num_tokens": 5623737.0, + "step": 50 + }, + { + "epoch": 0.3763837638376384, + "grad_norm": 0.5032065722860658, + "learning_rate": 9.998950285723228e-06, + "loss": 0.9468, + "num_tokens": 5740532.0, + "step": 51 + }, + { + "epoch": 0.3837638376383764, + "grad_norm": 0.5659681942216157, + "learning_rate": 9.998704068267427e-06, + "loss": 1.1049, + "num_tokens": 5847705.0, + "step": 52 + }, + { + "epoch": 0.39114391143911437, + "grad_norm": 0.4782117232104864, + "learning_rate": 9.998431938409822e-06, + "loss": 1.0344, + "num_tokens": 5978117.0, + "step": 53 + }, + { + "epoch": 0.3985239852398524, + "grad_norm": 0.4647019175418653, + "learning_rate": 9.998133897717868e-06, + "loss": 0.9418, + "num_tokens": 6095872.0, + "step": 54 + }, + { + "epoch": 0.4059040590405904, + "grad_norm": 0.5143036254859215, + "learning_rate": 9.997809947908275e-06, + "loss": 1.0138, + "num_tokens": 6197245.0, + "step": 55 + }, + { + "epoch": 0.4132841328413284, + "grad_norm": 0.4808656273298818, + "learning_rate": 9.997460090846982e-06, + "loss": 0.9426, + "num_tokens": 6302933.0, + "step": 56 + }, + { + "epoch": 0.42066420664206644, + "grad_norm": 0.5754826460700532, + "learning_rate": 9.997084328549156e-06, + "loss": 1.086, + "num_tokens": 6395435.0, + "step": 57 + }, + { + "epoch": 0.4280442804428044, + "grad_norm": 0.5614265194817842, + "learning_rate": 9.996682663179175e-06, + "loss": 1.0444, + "num_tokens": 6509555.0, + "step": 58 + }, + { + "epoch": 0.4354243542435424, + "grad_norm": 0.5712826432269551, + "learning_rate": 9.996255097050624e-06, + "loss": 1.0086, + "num_tokens": 6617826.0, + "step": 59 + }, + { + "epoch": 0.44280442804428044, + "grad_norm": 0.563978826243104, + "learning_rate": 9.995801632626267e-06, + "loss": 1.0746, + "num_tokens": 6715676.0, + "step": 60 + }, + { + "epoch": 0.45018450184501846, + "grad_norm": 0.5513032046654982, + "learning_rate": 9.995322272518046e-06, + "loss": 1.0607, + "num_tokens": 6809492.0, + "step": 61 + }, + { + "epoch": 0.4575645756457565, + "grad_norm": 0.4974539118984009, + "learning_rate": 9.994817019487061e-06, + "loss": 0.952, + "num_tokens": 6914850.0, + "step": 62 + }, + { + "epoch": 0.46494464944649444, + "grad_norm": 0.438511917566343, + "learning_rate": 9.994285876443558e-06, + "loss": 0.9537, + "num_tokens": 7039180.0, + "step": 63 + }, + { + "epoch": 0.47232472324723246, + "grad_norm": 0.5378396549591032, + "learning_rate": 9.993728846446903e-06, + "loss": 1.0574, + "num_tokens": 7128575.0, + "step": 64 + }, + { + "epoch": 0.4797047970479705, + "grad_norm": 0.6082465859999698, + "learning_rate": 9.993145932705569e-06, + "loss": 1.0524, + "num_tokens": 7217972.0, + "step": 65 + }, + { + "epoch": 0.4870848708487085, + "grad_norm": 0.5103591233335351, + "learning_rate": 9.992537138577125e-06, + "loss": 1.0469, + "num_tokens": 7354135.0, + "step": 66 + }, + { + "epoch": 0.4944649446494465, + "grad_norm": 0.4621068253841004, + "learning_rate": 9.991902467568208e-06, + "loss": 0.9382, + "num_tokens": 7471432.0, + "step": 67 + }, + { + "epoch": 0.5018450184501845, + "grad_norm": 0.45732749008487794, + "learning_rate": 9.991241923334503e-06, + "loss": 0.9203, + "num_tokens": 7590339.0, + "step": 68 + }, + { + "epoch": 0.5092250922509225, + "grad_norm": 0.4662000389870386, + "learning_rate": 9.990555509680723e-06, + "loss": 0.9102, + "num_tokens": 7711707.0, + "step": 69 + }, + { + "epoch": 0.5166051660516605, + "grad_norm": 0.43875655100732047, + "learning_rate": 9.989843230560593e-06, + "loss": 0.8622, + "num_tokens": 7826782.0, + "step": 70 + }, + { + "epoch": 0.5239852398523985, + "grad_norm": 0.5489110482145291, + "learning_rate": 9.98910509007682e-06, + "loss": 1.0403, + "num_tokens": 7976473.0, + "step": 71 + }, + { + "epoch": 0.5313653136531366, + "grad_norm": 0.4574193609141772, + "learning_rate": 9.988341092481069e-06, + "loss": 0.9335, + "num_tokens": 8115460.0, + "step": 72 + }, + { + "epoch": 0.5387453874538746, + "grad_norm": 0.49070862262475273, + "learning_rate": 9.987551242173945e-06, + "loss": 1.294, + "num_tokens": 8250050.0, + "step": 73 + }, + { + "epoch": 0.5461254612546126, + "grad_norm": 0.4234076913104516, + "learning_rate": 9.986735543704961e-06, + "loss": 0.9397, + "num_tokens": 8388396.0, + "step": 74 + }, + { + "epoch": 0.5535055350553506, + "grad_norm": 0.46710664453501544, + "learning_rate": 9.985894001772519e-06, + "loss": 1.0268, + "num_tokens": 8519930.0, + "step": 75 + }, + { + "epoch": 0.5608856088560885, + "grad_norm": 0.555996347697104, + "learning_rate": 9.98502662122387e-06, + "loss": 1.0803, + "num_tokens": 8613426.0, + "step": 76 + }, + { + "epoch": 0.5682656826568265, + "grad_norm": 0.5211423653500821, + "learning_rate": 9.984133407055105e-06, + "loss": 1.0403, + "num_tokens": 8711173.0, + "step": 77 + }, + { + "epoch": 0.5756457564575646, + "grad_norm": 0.5516158654172905, + "learning_rate": 9.98321436441111e-06, + "loss": 1.1208, + "num_tokens": 8807954.0, + "step": 78 + }, + { + "epoch": 0.5830258302583026, + "grad_norm": 0.5539639014046897, + "learning_rate": 9.982269498585542e-06, + "loss": 1.0325, + "num_tokens": 8903831.0, + "step": 79 + }, + { + "epoch": 0.5904059040590406, + "grad_norm": 0.575542238443959, + "learning_rate": 9.981298815020804e-06, + "loss": 1.0463, + "num_tokens": 9003846.0, + "step": 80 + }, + { + "epoch": 0.5977859778597786, + "grad_norm": 0.45423267003556045, + "learning_rate": 9.980302319307998e-06, + "loss": 1.0481, + "num_tokens": 9130236.0, + "step": 81 + }, + { + "epoch": 0.6051660516605166, + "grad_norm": 0.463031467112172, + "learning_rate": 9.979280017186915e-06, + "loss": 1.0374, + "num_tokens": 9256557.0, + "step": 82 + }, + { + "epoch": 0.6125461254612546, + "grad_norm": 0.4416973693733997, + "learning_rate": 9.978231914545981e-06, + "loss": 0.9264, + "num_tokens": 9373670.0, + "step": 83 + }, + { + "epoch": 0.6199261992619927, + "grad_norm": 0.5238508886142513, + "learning_rate": 9.977158017422241e-06, + "loss": 1.0262, + "num_tokens": 9491588.0, + "step": 84 + }, + { + "epoch": 0.6273062730627307, + "grad_norm": 0.442540814989928, + "learning_rate": 9.976058332001307e-06, + "loss": 0.9437, + "num_tokens": 9616906.0, + "step": 85 + }, + { + "epoch": 0.6346863468634686, + "grad_norm": 0.5112364524346023, + "learning_rate": 9.974932864617333e-06, + "loss": 1.0335, + "num_tokens": 9723859.0, + "step": 86 + }, + { + "epoch": 0.6420664206642066, + "grad_norm": 0.5358861902240765, + "learning_rate": 9.973781621752982e-06, + "loss": 1.2516, + "num_tokens": 9839708.0, + "step": 87 + }, + { + "epoch": 0.6494464944649446, + "grad_norm": 0.5382936334494667, + "learning_rate": 9.972604610039376e-06, + "loss": 1.0241, + "num_tokens": 9934614.0, + "step": 88 + }, + { + "epoch": 0.6568265682656826, + "grad_norm": 0.4511647212099882, + "learning_rate": 9.971401836256066e-06, + "loss": 1.0173, + "num_tokens": 10052192.0, + "step": 89 + }, + { + "epoch": 0.6642066420664207, + "grad_norm": 0.6209524506305847, + "learning_rate": 9.970173307330998e-06, + "loss": 1.1109, + "num_tokens": 10131429.0, + "step": 90 + }, + { + "epoch": 0.6715867158671587, + "grad_norm": 0.4760295224336866, + "learning_rate": 9.968919030340458e-06, + "loss": 0.9765, + "num_tokens": 10252935.0, + "step": 91 + }, + { + "epoch": 0.6789667896678967, + "grad_norm": 0.632070574170874, + "learning_rate": 9.967639012509046e-06, + "loss": 1.0778, + "num_tokens": 10356066.0, + "step": 92 + }, + { + "epoch": 0.6863468634686347, + "grad_norm": 0.4819917337977906, + "learning_rate": 9.966333261209625e-06, + "loss": 0.957, + "num_tokens": 10466192.0, + "step": 93 + }, + { + "epoch": 0.6937269372693727, + "grad_norm": 0.5066375366228465, + "learning_rate": 9.965001783963287e-06, + "loss": 1.0014, + "num_tokens": 10569535.0, + "step": 94 + }, + { + "epoch": 0.7011070110701108, + "grad_norm": 0.450509316551867, + "learning_rate": 9.963644588439297e-06, + "loss": 0.9054, + "num_tokens": 10688885.0, + "step": 95 + }, + { + "epoch": 0.7084870848708487, + "grad_norm": 0.44224377901493983, + "learning_rate": 9.962261682455065e-06, + "loss": 1.35, + "num_tokens": 10856147.0, + "step": 96 + }, + { + "epoch": 0.7158671586715867, + "grad_norm": 0.4777650383162576, + "learning_rate": 9.960853073976086e-06, + "loss": 1.0102, + "num_tokens": 10960833.0, + "step": 97 + }, + { + "epoch": 0.7232472324723247, + "grad_norm": 0.5398453918419668, + "learning_rate": 9.959418771115904e-06, + "loss": 1.0557, + "num_tokens": 11062207.0, + "step": 98 + }, + { + "epoch": 0.7306273062730627, + "grad_norm": 0.530337299319452, + "learning_rate": 9.95795878213606e-06, + "loss": 0.9023, + "num_tokens": 11171307.0, + "step": 99 + }, + { + "epoch": 0.7380073800738007, + "grad_norm": 0.5622636104830259, + "learning_rate": 9.956473115446049e-06, + "loss": 1.0633, + "num_tokens": 11254005.0, + "step": 100 + }, + { + "epoch": 0.7453874538745388, + "grad_norm": 0.46456704334968907, + "learning_rate": 9.954961779603264e-06, + "loss": 0.927, + "num_tokens": 11394665.0, + "step": 101 + }, + { + "epoch": 0.7527675276752768, + "grad_norm": 0.5488257951604908, + "learning_rate": 9.953424783312957e-06, + "loss": 1.0226, + "num_tokens": 11476158.0, + "step": 102 + }, + { + "epoch": 0.7601476014760148, + "grad_norm": 0.4300034767733986, + "learning_rate": 9.95186213542818e-06, + "loss": 1.0092, + "num_tokens": 11600918.0, + "step": 103 + }, + { + "epoch": 0.7675276752767528, + "grad_norm": 0.4938744510670259, + "learning_rate": 9.950273844949737e-06, + "loss": 0.9517, + "num_tokens": 11709035.0, + "step": 104 + }, + { + "epoch": 0.7749077490774908, + "grad_norm": 0.5036060443814918, + "learning_rate": 9.948659921026139e-06, + "loss": 0.9781, + "num_tokens": 11802949.0, + "step": 105 + }, + { + "epoch": 0.7822878228782287, + "grad_norm": 0.5610444858635553, + "learning_rate": 9.947020372953533e-06, + "loss": 1.0902, + "num_tokens": 11890336.0, + "step": 106 + }, + { + "epoch": 0.7896678966789668, + "grad_norm": 0.5163528393079438, + "learning_rate": 9.945355210175673e-06, + "loss": 0.9623, + "num_tokens": 11984884.0, + "step": 107 + }, + { + "epoch": 0.7970479704797048, + "grad_norm": 0.5159615372931292, + "learning_rate": 9.943664442283845e-06, + "loss": 1.0683, + "num_tokens": 12077229.0, + "step": 108 + }, + { + "epoch": 0.8044280442804428, + "grad_norm": 0.5130347704349766, + "learning_rate": 9.94194807901682e-06, + "loss": 1.0055, + "num_tokens": 12173116.0, + "step": 109 + }, + { + "epoch": 0.8118081180811808, + "grad_norm": 0.4554463222781016, + "learning_rate": 9.9402061302608e-06, + "loss": 0.97, + "num_tokens": 12280224.0, + "step": 110 + }, + { + "epoch": 0.8191881918819188, + "grad_norm": 0.49800466527382514, + "learning_rate": 9.938438606049362e-06, + "loss": 1.0198, + "num_tokens": 12377281.0, + "step": 111 + }, + { + "epoch": 0.8265682656826568, + "grad_norm": 0.6048015082550192, + "learning_rate": 9.936645516563387e-06, + "loss": 1.0646, + "num_tokens": 12453177.0, + "step": 112 + }, + { + "epoch": 0.8339483394833949, + "grad_norm": 0.5217564431490526, + "learning_rate": 9.934826872131024e-06, + "loss": 1.1159, + "num_tokens": 12547604.0, + "step": 113 + }, + { + "epoch": 0.8413284132841329, + "grad_norm": 0.5421231223025501, + "learning_rate": 9.932982683227606e-06, + "loss": 1.1143, + "num_tokens": 12637010.0, + "step": 114 + }, + { + "epoch": 0.8487084870848709, + "grad_norm": 0.47714060338286013, + "learning_rate": 9.931112960475606e-06, + "loss": 0.941, + "num_tokens": 12740899.0, + "step": 115 + }, + { + "epoch": 0.8560885608856088, + "grad_norm": 0.5526084769825411, + "learning_rate": 9.929217714644574e-06, + "loss": 0.9838, + "num_tokens": 12837300.0, + "step": 116 + }, + { + "epoch": 0.8634686346863468, + "grad_norm": 0.4666579296080565, + "learning_rate": 9.927296956651069e-06, + "loss": 0.907, + "num_tokens": 12945601.0, + "step": 117 + }, + { + "epoch": 0.8708487084870848, + "grad_norm": 0.4740058144949936, + "learning_rate": 9.925350697558598e-06, + "loss": 0.9664, + "num_tokens": 13052482.0, + "step": 118 + }, + { + "epoch": 0.8782287822878229, + "grad_norm": 0.47431280302190226, + "learning_rate": 9.92337894857756e-06, + "loss": 1.0038, + "num_tokens": 13183801.0, + "step": 119 + }, + { + "epoch": 0.8856088560885609, + "grad_norm": 0.526188736768355, + "learning_rate": 9.921381721065164e-06, + "loss": 0.9861, + "num_tokens": 13290010.0, + "step": 120 + }, + { + "epoch": 0.8929889298892989, + "grad_norm": 0.5201598156189171, + "learning_rate": 9.919359026525389e-06, + "loss": 0.9604, + "num_tokens": 13392353.0, + "step": 121 + }, + { + "epoch": 0.9003690036900369, + "grad_norm": 0.48530077756820317, + "learning_rate": 9.91731087660889e-06, + "loss": 0.9916, + "num_tokens": 13499246.0, + "step": 122 + }, + { + "epoch": 0.9077490774907749, + "grad_norm": 0.515320285644199, + "learning_rate": 9.91523728311295e-06, + "loss": 1.0087, + "num_tokens": 13584502.0, + "step": 123 + }, + { + "epoch": 0.915129151291513, + "grad_norm": 0.46818818518124683, + "learning_rate": 9.913138257981408e-06, + "loss": 0.9929, + "num_tokens": 13710703.0, + "step": 124 + }, + { + "epoch": 0.922509225092251, + "grad_norm": 0.46147504674366924, + "learning_rate": 9.911013813304584e-06, + "loss": 0.9492, + "num_tokens": 13827190.0, + "step": 125 + }, + { + "epoch": 0.9298892988929889, + "grad_norm": 0.44841838892898456, + "learning_rate": 9.90886396131922e-06, + "loss": 0.9587, + "num_tokens": 13942355.0, + "step": 126 + }, + { + "epoch": 0.9372693726937269, + "grad_norm": 0.4667768028159005, + "learning_rate": 9.906688714408396e-06, + "loss": 0.9677, + "num_tokens": 14045098.0, + "step": 127 + }, + { + "epoch": 0.9446494464944649, + "grad_norm": 0.47864280022738454, + "learning_rate": 9.904488085101472e-06, + "loss": 0.9383, + "num_tokens": 14138298.0, + "step": 128 + }, + { + "epoch": 0.9520295202952029, + "grad_norm": 0.5148983443269514, + "learning_rate": 9.902262086074005e-06, + "loss": 1.0295, + "num_tokens": 14256039.0, + "step": 129 + }, + { + "epoch": 0.959409594095941, + "grad_norm": 0.5093081614806382, + "learning_rate": 9.900010730147685e-06, + "loss": 0.9741, + "num_tokens": 14377661.0, + "step": 130 + }, + { + "epoch": 0.966789667896679, + "grad_norm": 0.4647124416526912, + "learning_rate": 9.897734030290254e-06, + "loss": 0.9618, + "num_tokens": 14497517.0, + "step": 131 + }, + { + "epoch": 0.974169741697417, + "grad_norm": 0.46313506939147736, + "learning_rate": 9.895431999615436e-06, + "loss": 1.0342, + "num_tokens": 14622979.0, + "step": 132 + }, + { + "epoch": 0.981549815498155, + "grad_norm": 0.40892038773527534, + "learning_rate": 9.893104651382863e-06, + "loss": 0.912, + "num_tokens": 14753899.0, + "step": 133 + }, + { + "epoch": 0.988929889298893, + "grad_norm": 0.566168820425785, + "learning_rate": 9.890751998997986e-06, + "loss": 1.0993, + "num_tokens": 14834866.0, + "step": 134 + }, + { + "epoch": 0.996309963099631, + "grad_norm": 0.4592197488603727, + "learning_rate": 9.888374056012016e-06, + "loss": 0.9305, + "num_tokens": 14952619.0, + "step": 135 + }, + { + "epoch": 1.0, + "grad_norm": 0.4592197488603727, + "learning_rate": 9.885970836121833e-06, + "loss": 1.037, + "num_tokens": 15018208.0, + "step": 136 + }, + { + "epoch": 1.007380073800738, + "grad_norm": 0.7989877914889159, + "learning_rate": 9.88354235316991e-06, + "loss": 0.9263, + "num_tokens": 15133428.0, + "step": 137 + }, + { + "epoch": 1.014760147601476, + "grad_norm": 0.49591188266357994, + "learning_rate": 9.881088621144242e-06, + "loss": 0.8977, + "num_tokens": 15231140.0, + "step": 138 + }, + { + "epoch": 1.022140221402214, + "grad_norm": 0.4543003054132371, + "learning_rate": 9.87860965417825e-06, + "loss": 0.8595, + "num_tokens": 15374666.0, + "step": 139 + }, + { + "epoch": 1.029520295202952, + "grad_norm": 0.4585183506218285, + "learning_rate": 9.876105466550708e-06, + "loss": 0.9014, + "num_tokens": 15481846.0, + "step": 140 + }, + { + "epoch": 1.03690036900369, + "grad_norm": 0.43441901355258156, + "learning_rate": 9.873576072685665e-06, + "loss": 0.9625, + "num_tokens": 15599370.0, + "step": 141 + }, + { + "epoch": 1.044280442804428, + "grad_norm": 0.4461158736534833, + "learning_rate": 9.871021487152353e-06, + "loss": 0.9303, + "num_tokens": 15720156.0, + "step": 142 + }, + { + "epoch": 1.051660516605166, + "grad_norm": 0.7840474037444799, + "learning_rate": 9.86844172466511e-06, + "loss": 0.8877, + "num_tokens": 15829906.0, + "step": 143 + }, + { + "epoch": 1.0590405904059041, + "grad_norm": 0.4502214845749829, + "learning_rate": 9.865836800083291e-06, + "loss": 0.8896, + "num_tokens": 15940844.0, + "step": 144 + }, + { + "epoch": 1.066420664206642, + "grad_norm": 0.4981431652238191, + "learning_rate": 9.863206728411184e-06, + "loss": 0.82, + "num_tokens": 16047950.0, + "step": 145 + }, + { + "epoch": 1.07380073800738, + "grad_norm": 0.4894324276236524, + "learning_rate": 9.860551524797922e-06, + "loss": 0.8665, + "num_tokens": 16162047.0, + "step": 146 + }, + { + "epoch": 1.081180811808118, + "grad_norm": 0.4757379804176477, + "learning_rate": 9.857871204537403e-06, + "loss": 0.8054, + "num_tokens": 16282646.0, + "step": 147 + }, + { + "epoch": 1.088560885608856, + "grad_norm": 0.48374247406787124, + "learning_rate": 9.855165783068188e-06, + "loss": 0.8617, + "num_tokens": 16386527.0, + "step": 148 + }, + { + "epoch": 1.0959409594095941, + "grad_norm": 0.4336051240979257, + "learning_rate": 9.852435275973427e-06, + "loss": 0.7327, + "num_tokens": 16496015.0, + "step": 149 + }, + { + "epoch": 1.103321033210332, + "grad_norm": 0.519627688116221, + "learning_rate": 9.849679698980757e-06, + "loss": 0.9757, + "num_tokens": 16582017.0, + "step": 150 + }, + { + "epoch": 1.1107011070110702, + "grad_norm": 0.4941153232037019, + "learning_rate": 9.846899067962223e-06, + "loss": 0.8149, + "num_tokens": 16697936.0, + "step": 151 + }, + { + "epoch": 1.118081180811808, + "grad_norm": 0.44527519190153636, + "learning_rate": 9.844093398934175e-06, + "loss": 0.795, + "num_tokens": 16830125.0, + "step": 152 + }, + { + "epoch": 1.1254612546125462, + "grad_norm": 0.4603797119386462, + "learning_rate": 9.841262708057183e-06, + "loss": 0.8797, + "num_tokens": 16927202.0, + "step": 153 + }, + { + "epoch": 1.132841328413284, + "grad_norm": 0.5086903095824973, + "learning_rate": 9.838407011635944e-06, + "loss": 0.8436, + "num_tokens": 17036892.0, + "step": 154 + }, + { + "epoch": 1.140221402214022, + "grad_norm": 0.5071025376685536, + "learning_rate": 9.835526326119183e-06, + "loss": 0.8922, + "num_tokens": 17146176.0, + "step": 155 + }, + { + "epoch": 1.1476014760147601, + "grad_norm": 0.4637035144739838, + "learning_rate": 9.832620668099566e-06, + "loss": 0.744, + "num_tokens": 17280168.0, + "step": 156 + }, + { + "epoch": 1.1549815498154983, + "grad_norm": 0.5528942275682165, + "learning_rate": 9.829690054313592e-06, + "loss": 0.9498, + "num_tokens": 17389972.0, + "step": 157 + }, + { + "epoch": 1.1623616236162362, + "grad_norm": 0.531456746186789, + "learning_rate": 9.826734501641512e-06, + "loss": 0.863, + "num_tokens": 17479848.0, + "step": 158 + }, + { + "epoch": 1.169741697416974, + "grad_norm": 0.5276513234105304, + "learning_rate": 9.823754027107221e-06, + "loss": 0.7905, + "num_tokens": 17575558.0, + "step": 159 + }, + { + "epoch": 1.1771217712177122, + "grad_norm": 0.5195819159511271, + "learning_rate": 9.820748647878166e-06, + "loss": 0.8966, + "num_tokens": 17677486.0, + "step": 160 + }, + { + "epoch": 1.1845018450184501, + "grad_norm": 0.5211095057318215, + "learning_rate": 9.81771838126524e-06, + "loss": 0.789, + "num_tokens": 17777722.0, + "step": 161 + }, + { + "epoch": 1.1918819188191883, + "grad_norm": 0.46350237811105727, + "learning_rate": 9.814663244722689e-06, + "loss": 0.8838, + "num_tokens": 17906149.0, + "step": 162 + }, + { + "epoch": 1.1992619926199262, + "grad_norm": 0.44572953162057055, + "learning_rate": 9.811583255848005e-06, + "loss": 0.7961, + "num_tokens": 18031173.0, + "step": 163 + }, + { + "epoch": 1.2066420664206643, + "grad_norm": 0.4512198146226335, + "learning_rate": 9.808478432381841e-06, + "loss": 0.8843, + "num_tokens": 18134079.0, + "step": 164 + }, + { + "epoch": 1.2140221402214022, + "grad_norm": 0.482052383393916, + "learning_rate": 9.805348792207883e-06, + "loss": 0.8385, + "num_tokens": 18242380.0, + "step": 165 + }, + { + "epoch": 1.2214022140221403, + "grad_norm": 0.4637735488794427, + "learning_rate": 9.802194353352765e-06, + "loss": 0.8381, + "num_tokens": 18343696.0, + "step": 166 + }, + { + "epoch": 1.2287822878228782, + "grad_norm": 0.4184603981198051, + "learning_rate": 9.79901513398596e-06, + "loss": 0.8284, + "num_tokens": 18448780.0, + "step": 167 + }, + { + "epoch": 1.2361623616236161, + "grad_norm": 0.5906440020601853, + "learning_rate": 9.79581115241968e-06, + "loss": 0.8404, + "num_tokens": 18543863.0, + "step": 168 + }, + { + "epoch": 1.2435424354243543, + "grad_norm": 0.47936881644553336, + "learning_rate": 9.792582427108762e-06, + "loss": 0.947, + "num_tokens": 18652628.0, + "step": 169 + }, + { + "epoch": 1.2509225092250922, + "grad_norm": 0.5086167685204926, + "learning_rate": 9.789328976650568e-06, + "loss": 0.8962, + "num_tokens": 18760319.0, + "step": 170 + }, + { + "epoch": 1.2583025830258303, + "grad_norm": 0.47902043720502113, + "learning_rate": 9.786050819784877e-06, + "loss": 0.7773, + "num_tokens": 18860975.0, + "step": 171 + }, + { + "epoch": 1.2656826568265682, + "grad_norm": 0.49439604693271255, + "learning_rate": 9.782747975393776e-06, + "loss": 0.7651, + "num_tokens": 18944704.0, + "step": 172 + }, + { + "epoch": 1.2730627306273063, + "grad_norm": 0.554868265270938, + "learning_rate": 9.779420462501548e-06, + "loss": 0.9466, + "num_tokens": 19057375.0, + "step": 173 + }, + { + "epoch": 1.2804428044280443, + "grad_norm": 0.5082989408166297, + "learning_rate": 9.776068300274568e-06, + "loss": 0.8912, + "num_tokens": 19175395.0, + "step": 174 + }, + { + "epoch": 1.2878228782287824, + "grad_norm": 0.5311691180886349, + "learning_rate": 9.772691508021194e-06, + "loss": 0.869, + "num_tokens": 19274879.0, + "step": 175 + }, + { + "epoch": 1.2952029520295203, + "grad_norm": 0.5001485232951035, + "learning_rate": 9.769290105191649e-06, + "loss": 0.8186, + "num_tokens": 19395971.0, + "step": 176 + }, + { + "epoch": 1.3025830258302582, + "grad_norm": 0.4993351556405326, + "learning_rate": 9.765864111377906e-06, + "loss": 0.8036, + "num_tokens": 19487184.0, + "step": 177 + }, + { + "epoch": 1.3099630996309963, + "grad_norm": 0.5117651842979043, + "learning_rate": 9.762413546313597e-06, + "loss": 0.932, + "num_tokens": 19577791.0, + "step": 178 + }, + { + "epoch": 1.3173431734317342, + "grad_norm": 0.504664400769316, + "learning_rate": 9.758938429873867e-06, + "loss": 0.8499, + "num_tokens": 19680531.0, + "step": 179 + }, + { + "epoch": 1.3247232472324724, + "grad_norm": 0.5338570837550553, + "learning_rate": 9.755438782075285e-06, + "loss": 0.8251, + "num_tokens": 19784918.0, + "step": 180 + }, + { + "epoch": 1.3321033210332103, + "grad_norm": 0.4593700755128409, + "learning_rate": 9.751914623075724e-06, + "loss": 0.8421, + "num_tokens": 19885160.0, + "step": 181 + }, + { + "epoch": 1.3394833948339484, + "grad_norm": 0.6484547228982595, + "learning_rate": 9.748365973174228e-06, + "loss": 1.2993, + "num_tokens": 20025884.0, + "step": 182 + }, + { + "epoch": 1.3468634686346863, + "grad_norm": 0.5525718219712958, + "learning_rate": 9.744792852810916e-06, + "loss": 0.8541, + "num_tokens": 20117897.0, + "step": 183 + }, + { + "epoch": 1.3542435424354244, + "grad_norm": 0.4998012568880781, + "learning_rate": 9.74119528256686e-06, + "loss": 0.8986, + "num_tokens": 20211673.0, + "step": 184 + }, + { + "epoch": 1.3616236162361623, + "grad_norm": 0.5298929870281555, + "learning_rate": 9.737573283163952e-06, + "loss": 0.9566, + "num_tokens": 20356074.0, + "step": 185 + }, + { + "epoch": 1.3690036900369003, + "grad_norm": 0.5210495050174606, + "learning_rate": 9.733926875464805e-06, + "loss": 0.8431, + "num_tokens": 20507950.0, + "step": 186 + }, + { + "epoch": 1.3763837638376384, + "grad_norm": 0.46728864361697287, + "learning_rate": 9.730256080472618e-06, + "loss": 0.8116, + "num_tokens": 20610309.0, + "step": 187 + }, + { + "epoch": 1.3837638376383765, + "grad_norm": 0.4881675871337124, + "learning_rate": 9.72656091933106e-06, + "loss": 0.8415, + "num_tokens": 20716085.0, + "step": 188 + }, + { + "epoch": 1.3911439114391144, + "grad_norm": 0.4585691063343814, + "learning_rate": 9.722841413324148e-06, + "loss": 0.9216, + "num_tokens": 20824644.0, + "step": 189 + }, + { + "epoch": 1.3985239852398523, + "grad_norm": 0.5277282283133232, + "learning_rate": 9.719097583876131e-06, + "loss": 0.8181, + "num_tokens": 20951577.0, + "step": 190 + }, + { + "epoch": 1.4059040590405905, + "grad_norm": 0.4990143013122359, + "learning_rate": 9.715329452551351e-06, + "loss": 0.867, + "num_tokens": 21047084.0, + "step": 191 + }, + { + "epoch": 1.4132841328413284, + "grad_norm": 0.4971444658278234, + "learning_rate": 9.711537041054135e-06, + "loss": 0.8007, + "num_tokens": 21143609.0, + "step": 192 + }, + { + "epoch": 1.4206642066420665, + "grad_norm": 0.5042163135566892, + "learning_rate": 9.70772037122866e-06, + "loss": 0.7865, + "num_tokens": 21260697.0, + "step": 193 + }, + { + "epoch": 1.4280442804428044, + "grad_norm": 0.49618997802851683, + "learning_rate": 9.70387946505883e-06, + "loss": 0.8052, + "num_tokens": 21394978.0, + "step": 194 + }, + { + "epoch": 1.4354243542435423, + "grad_norm": 0.4394881712817789, + "learning_rate": 9.700014344668152e-06, + "loss": 0.8518, + "num_tokens": 21518161.0, + "step": 195 + }, + { + "epoch": 1.4428044280442804, + "grad_norm": 0.5185166994477352, + "learning_rate": 9.696125032319601e-06, + "loss": 0.8927, + "num_tokens": 21604465.0, + "step": 196 + }, + { + "epoch": 1.4501845018450186, + "grad_norm": 0.4853359471099058, + "learning_rate": 9.692211550415506e-06, + "loss": 0.7539, + "num_tokens": 21717937.0, + "step": 197 + }, + { + "epoch": 1.4575645756457565, + "grad_norm": 0.46363388748721307, + "learning_rate": 9.688273921497404e-06, + "loss": 0.7983, + "num_tokens": 21822988.0, + "step": 198 + }, + { + "epoch": 1.4649446494464944, + "grad_norm": 0.5486216548585544, + "learning_rate": 9.684312168245918e-06, + "loss": 0.9988, + "num_tokens": 21926485.0, + "step": 199 + }, + { + "epoch": 1.4723247232472325, + "grad_norm": 0.5953560651449461, + "learning_rate": 9.680326313480633e-06, + "loss": 0.8636, + "num_tokens": 22023009.0, + "step": 200 + }, + { + "epoch": 1.4797047970479704, + "grad_norm": 0.49361008034993414, + "learning_rate": 9.676316380159952e-06, + "loss": 0.8099, + "num_tokens": 22131849.0, + "step": 201 + }, + { + "epoch": 1.4870848708487086, + "grad_norm": 0.5002101213490427, + "learning_rate": 9.672282391380972e-06, + "loss": 0.7703, + "num_tokens": 22245342.0, + "step": 202 + }, + { + "epoch": 1.4944649446494465, + "grad_norm": 0.4785025045933051, + "learning_rate": 9.668224370379348e-06, + "loss": 0.8508, + "num_tokens": 22337769.0, + "step": 203 + }, + { + "epoch": 1.5018450184501844, + "grad_norm": 0.5435558274748625, + "learning_rate": 9.664142340529164e-06, + "loss": 0.7785, + "num_tokens": 22439058.0, + "step": 204 + }, + { + "epoch": 1.5092250922509225, + "grad_norm": 0.5532069442290298, + "learning_rate": 9.660036325342786e-06, + "loss": 0.8826, + "num_tokens": 22536430.0, + "step": 205 + }, + { + "epoch": 1.5166051660516606, + "grad_norm": 0.5307452858748856, + "learning_rate": 9.65590634847074e-06, + "loss": 0.851, + "num_tokens": 22632498.0, + "step": 206 + }, + { + "epoch": 1.5239852398523985, + "grad_norm": 0.4832687140091762, + "learning_rate": 9.651752433701574e-06, + "loss": 0.8294, + "num_tokens": 22735687.0, + "step": 207 + }, + { + "epoch": 1.5313653136531364, + "grad_norm": 0.5227010720446035, + "learning_rate": 9.64757460496171e-06, + "loss": 0.8114, + "num_tokens": 22864273.0, + "step": 208 + }, + { + "epoch": 1.5387453874538746, + "grad_norm": 0.49492647805506507, + "learning_rate": 9.64337288631532e-06, + "loss": 0.8636, + "num_tokens": 22962379.0, + "step": 209 + }, + { + "epoch": 1.5461254612546127, + "grad_norm": 0.4866765694699883, + "learning_rate": 9.639147301964175e-06, + "loss": 0.9815, + "num_tokens": 23072097.0, + "step": 210 + }, + { + "epoch": 1.5535055350553506, + "grad_norm": 0.5506302300377369, + "learning_rate": 9.63489787624752e-06, + "loss": 0.9361, + "num_tokens": 23180860.0, + "step": 211 + }, + { + "epoch": 1.5608856088560885, + "grad_norm": 0.4909442981205813, + "learning_rate": 9.630624633641918e-06, + "loss": 0.8018, + "num_tokens": 23326533.0, + "step": 212 + }, + { + "epoch": 1.5682656826568264, + "grad_norm": 0.403064429979273, + "learning_rate": 9.62632759876112e-06, + "loss": 0.7859, + "num_tokens": 23446416.0, + "step": 213 + }, + { + "epoch": 1.5756457564575646, + "grad_norm": 0.5048313215274081, + "learning_rate": 9.622006796355918e-06, + "loss": 0.7909, + "num_tokens": 23543993.0, + "step": 214 + }, + { + "epoch": 1.5830258302583027, + "grad_norm": 0.46500855134292074, + "learning_rate": 9.61766225131401e-06, + "loss": 0.8631, + "num_tokens": 23678207.0, + "step": 215 + }, + { + "epoch": 1.5904059040590406, + "grad_norm": 0.5254094413084759, + "learning_rate": 9.61329398865984e-06, + "loss": 0.8594, + "num_tokens": 23810928.0, + "step": 216 + }, + { + "epoch": 1.5977859778597785, + "grad_norm": 0.49497609979741825, + "learning_rate": 9.608902033554476e-06, + "loss": 0.8742, + "num_tokens": 23917356.0, + "step": 217 + }, + { + "epoch": 1.6051660516605166, + "grad_norm": 0.5417375416147627, + "learning_rate": 9.604486411295446e-06, + "loss": 0.8363, + "num_tokens": 24035253.0, + "step": 218 + }, + { + "epoch": 1.6125461254612548, + "grad_norm": 0.48750043128004844, + "learning_rate": 9.600047147316605e-06, + "loss": 0.7667, + "num_tokens": 24146736.0, + "step": 219 + }, + { + "epoch": 1.6199261992619927, + "grad_norm": 0.4505655779933478, + "learning_rate": 9.595584267187981e-06, + "loss": 0.7643, + "num_tokens": 24253499.0, + "step": 220 + }, + { + "epoch": 1.6273062730627306, + "grad_norm": 0.5037711126593194, + "learning_rate": 9.59109779661563e-06, + "loss": 0.8931, + "num_tokens": 24356714.0, + "step": 221 + }, + { + "epoch": 1.6346863468634685, + "grad_norm": 0.552666996933163, + "learning_rate": 9.586587761441491e-06, + "loss": 0.8319, + "num_tokens": 24439630.0, + "step": 222 + }, + { + "epoch": 1.6420664206642066, + "grad_norm": 0.5247298855240958, + "learning_rate": 9.582054187643233e-06, + "loss": 0.999, + "num_tokens": 24572473.0, + "step": 223 + }, + { + "epoch": 1.6494464944649447, + "grad_norm": 0.5246643217948993, + "learning_rate": 9.577497101334105e-06, + "loss": 0.8806, + "num_tokens": 24675846.0, + "step": 224 + }, + { + "epoch": 1.6568265682656826, + "grad_norm": 0.5114996684179052, + "learning_rate": 9.572916528762787e-06, + "loss": 0.8964, + "num_tokens": 24780580.0, + "step": 225 + }, + { + "epoch": 1.6642066420664205, + "grad_norm": 0.48230639883733895, + "learning_rate": 9.568312496313245e-06, + "loss": 0.8817, + "num_tokens": 24916338.0, + "step": 226 + }, + { + "epoch": 1.6715867158671587, + "grad_norm": 0.47555962389158385, + "learning_rate": 9.563685030504567e-06, + "loss": 0.8972, + "num_tokens": 25019133.0, + "step": 227 + }, + { + "epoch": 1.6789667896678968, + "grad_norm": 0.6237052756067296, + "learning_rate": 9.559034157990819e-06, + "loss": 0.7543, + "num_tokens": 25126580.0, + "step": 228 + }, + { + "epoch": 1.6863468634686347, + "grad_norm": 0.4704258168686015, + "learning_rate": 9.554359905560887e-06, + "loss": 0.8438, + "num_tokens": 25246893.0, + "step": 229 + }, + { + "epoch": 1.6937269372693726, + "grad_norm": 0.46206385798796146, + "learning_rate": 9.549662300138328e-06, + "loss": 0.8182, + "num_tokens": 25353087.0, + "step": 230 + }, + { + "epoch": 1.7011070110701108, + "grad_norm": 0.4972252613666411, + "learning_rate": 9.54494136878121e-06, + "loss": 0.8573, + "num_tokens": 25483872.0, + "step": 231 + }, + { + "epoch": 1.7084870848708487, + "grad_norm": 0.4414750299253366, + "learning_rate": 9.540197138681954e-06, + "loss": 0.7668, + "num_tokens": 25585638.0, + "step": 232 + }, + { + "epoch": 1.7158671586715868, + "grad_norm": 0.5554528041752966, + "learning_rate": 9.53542963716719e-06, + "loss": 1.0556, + "num_tokens": 25723031.0, + "step": 233 + }, + { + "epoch": 1.7232472324723247, + "grad_norm": 0.46030359376023355, + "learning_rate": 9.53063889169758e-06, + "loss": 0.915, + "num_tokens": 25839271.0, + "step": 234 + }, + { + "epoch": 1.7306273062730626, + "grad_norm": 0.5449032922287006, + "learning_rate": 9.525824929867679e-06, + "loss": 0.8693, + "num_tokens": 25925327.0, + "step": 235 + }, + { + "epoch": 1.7380073800738007, + "grad_norm": 0.4686019166095011, + "learning_rate": 9.52098777940576e-06, + "loss": 0.8072, + "num_tokens": 26057236.0, + "step": 236 + }, + { + "epoch": 1.7453874538745389, + "grad_norm": 0.4340016939308373, + "learning_rate": 9.516127468173674e-06, + "loss": 0.7962, + "num_tokens": 26166087.0, + "step": 237 + }, + { + "epoch": 1.7527675276752768, + "grad_norm": 0.5303575495741534, + "learning_rate": 9.511244024166661e-06, + "loss": 0.8571, + "num_tokens": 26255272.0, + "step": 238 + }, + { + "epoch": 1.7601476014760147, + "grad_norm": 0.5616358027384218, + "learning_rate": 9.506337475513216e-06, + "loss": 0.8681, + "num_tokens": 26353080.0, + "step": 239 + }, + { + "epoch": 1.7675276752767528, + "grad_norm": 0.5202222638637447, + "learning_rate": 9.501407850474916e-06, + "loss": 0.8949, + "num_tokens": 26439204.0, + "step": 240 + }, + { + "epoch": 1.774907749077491, + "grad_norm": 0.5609686292688814, + "learning_rate": 9.496455177446252e-06, + "loss": 0.8751, + "num_tokens": 26560904.0, + "step": 241 + }, + { + "epoch": 1.7822878228782288, + "grad_norm": 0.4613040830885771, + "learning_rate": 9.491479484954475e-06, + "loss": 0.869, + "num_tokens": 26665635.0, + "step": 242 + }, + { + "epoch": 1.7896678966789668, + "grad_norm": 0.5333172479366407, + "learning_rate": 9.486480801659423e-06, + "loss": 0.9239, + "num_tokens": 26769223.0, + "step": 243 + }, + { + "epoch": 1.7970479704797047, + "grad_norm": 0.4981884354859988, + "learning_rate": 9.481459156353368e-06, + "loss": 0.9425, + "num_tokens": 26876616.0, + "step": 244 + }, + { + "epoch": 1.8044280442804428, + "grad_norm": 0.4761851001292394, + "learning_rate": 9.476414577960835e-06, + "loss": 0.8853, + "num_tokens": 26966877.0, + "step": 245 + }, + { + "epoch": 1.811808118081181, + "grad_norm": 0.5918786962574727, + "learning_rate": 9.471347095538448e-06, + "loss": 0.836, + "num_tokens": 27062067.0, + "step": 246 + }, + { + "epoch": 1.8191881918819188, + "grad_norm": 0.47346571859338976, + "learning_rate": 9.46625673827475e-06, + "loss": 0.8121, + "num_tokens": 27233761.0, + "step": 247 + }, + { + "epoch": 1.8265682656826567, + "grad_norm": 0.43393941224602794, + "learning_rate": 9.461143535490053e-06, + "loss": 0.7822, + "num_tokens": 27351746.0, + "step": 248 + }, + { + "epoch": 1.8339483394833949, + "grad_norm": 0.4596856963327765, + "learning_rate": 9.45600751663625e-06, + "loss": 0.8125, + "num_tokens": 27470068.0, + "step": 249 + }, + { + "epoch": 1.841328413284133, + "grad_norm": 0.4623324180763746, + "learning_rate": 9.45084871129666e-06, + "loss": 0.8385, + "num_tokens": 27587540.0, + "step": 250 + }, + { + "epoch": 1.848708487084871, + "grad_norm": 0.4884311590737067, + "learning_rate": 9.445667149185846e-06, + "loss": 0.8584, + "num_tokens": 27699030.0, + "step": 251 + }, + { + "epoch": 1.8560885608856088, + "grad_norm": 0.5050542988903577, + "learning_rate": 9.440462860149452e-06, + "loss": 0.8695, + "num_tokens": 27815973.0, + "step": 252 + }, + { + "epoch": 1.8634686346863467, + "grad_norm": 0.4574188946697327, + "learning_rate": 9.435235874164029e-06, + "loss": 0.8468, + "num_tokens": 27911876.0, + "step": 253 + }, + { + "epoch": 1.8708487084870848, + "grad_norm": 0.5078878883639287, + "learning_rate": 9.429986221336861e-06, + "loss": 0.9666, + "num_tokens": 28024905.0, + "step": 254 + }, + { + "epoch": 1.878228782287823, + "grad_norm": 0.707997887987606, + "learning_rate": 9.424713931905793e-06, + "loss": 0.8271, + "num_tokens": 28127192.0, + "step": 255 + }, + { + "epoch": 1.8856088560885609, + "grad_norm": 0.5448729373134586, + "learning_rate": 9.419419036239053e-06, + "loss": 0.8077, + "num_tokens": 28212550.0, + "step": 256 + }, + { + "epoch": 1.8929889298892988, + "grad_norm": 0.46142556472216, + "learning_rate": 9.414101564835086e-06, + "loss": 0.7919, + "num_tokens": 28352190.0, + "step": 257 + }, + { + "epoch": 1.900369003690037, + "grad_norm": 0.4473495373569174, + "learning_rate": 9.408761548322367e-06, + "loss": 1.2457, + "num_tokens": 28507538.0, + "step": 258 + }, + { + "epoch": 1.907749077490775, + "grad_norm": 0.720865941907213, + "learning_rate": 9.403399017459236e-06, + "loss": 0.8916, + "num_tokens": 28630324.0, + "step": 259 + }, + { + "epoch": 1.915129151291513, + "grad_norm": 0.5405997607109805, + "learning_rate": 9.398014003133704e-06, + "loss": 0.8333, + "num_tokens": 28720013.0, + "step": 260 + }, + { + "epoch": 1.9225092250922509, + "grad_norm": 0.5243667632910098, + "learning_rate": 9.392606536363304e-06, + "loss": 0.8005, + "num_tokens": 28850176.0, + "step": 261 + }, + { + "epoch": 1.9298892988929888, + "grad_norm": 0.5120406103780775, + "learning_rate": 9.387176648294874e-06, + "loss": 0.8833, + "num_tokens": 28945819.0, + "step": 262 + }, + { + "epoch": 1.937269372693727, + "grad_norm": 0.46003493721185434, + "learning_rate": 9.381724370204414e-06, + "loss": 0.8105, + "num_tokens": 29069004.0, + "step": 263 + }, + { + "epoch": 1.944649446494465, + "grad_norm": 0.5105600061684051, + "learning_rate": 9.376249733496882e-06, + "loss": 0.852, + "num_tokens": 29197272.0, + "step": 264 + }, + { + "epoch": 1.952029520295203, + "grad_norm": 0.47786401535471007, + "learning_rate": 9.370752769706024e-06, + "loss": 0.8574, + "num_tokens": 29291901.0, + "step": 265 + }, + { + "epoch": 1.9594095940959408, + "grad_norm": 0.5006501587752776, + "learning_rate": 9.365233510494186e-06, + "loss": 0.8988, + "num_tokens": 29407240.0, + "step": 266 + }, + { + "epoch": 1.966789667896679, + "grad_norm": 0.5042822617717528, + "learning_rate": 9.35969198765214e-06, + "loss": 0.7268, + "num_tokens": 29503115.0, + "step": 267 + }, + { + "epoch": 1.974169741697417, + "grad_norm": 0.5223678106689094, + "learning_rate": 9.354128233098889e-06, + "loss": 0.907, + "num_tokens": 29612199.0, + "step": 268 + }, + { + "epoch": 1.981549815498155, + "grad_norm": 0.48980749434039345, + "learning_rate": 9.348542278881497e-06, + "loss": 0.9017, + "num_tokens": 29711314.0, + "step": 269 + }, + { + "epoch": 1.988929889298893, + "grad_norm": 0.5397711308358015, + "learning_rate": 9.342934157174895e-06, + "loss": 0.8368, + "num_tokens": 29858282.0, + "step": 270 + }, + { + "epoch": 1.996309963099631, + "grad_norm": 0.4018162467044364, + "learning_rate": 9.337303900281693e-06, + "loss": 0.7869, + "num_tokens": 29975582.0, + "step": 271 + }, + { + "epoch": 2.0, + "grad_norm": 0.7968766444431548, + "learning_rate": 9.33165154063201e-06, + "loss": 0.8073, + "num_tokens": 30038824.0, + "step": 272 + }, + { + "epoch": 2.007380073800738, + "grad_norm": 0.5215763819261514, + "learning_rate": 9.325977110783264e-06, + "loss": 0.7607, + "num_tokens": 30161706.0, + "step": 273 + }, + { + "epoch": 2.014760147601476, + "grad_norm": 0.4942048225579933, + "learning_rate": 9.320280643420006e-06, + "loss": 0.7386, + "num_tokens": 30287957.0, + "step": 274 + }, + { + "epoch": 2.022140221402214, + "grad_norm": 0.5113901907785446, + "learning_rate": 9.314562171353717e-06, + "loss": 0.7125, + "num_tokens": 30397373.0, + "step": 275 + }, + { + "epoch": 2.029520295202952, + "grad_norm": 0.5018401846949873, + "learning_rate": 9.308821727522626e-06, + "loss": 0.711, + "num_tokens": 30493389.0, + "step": 276 + }, + { + "epoch": 2.03690036900369, + "grad_norm": 0.5575170285871848, + "learning_rate": 9.303059344991519e-06, + "loss": 0.6267, + "num_tokens": 30601222.0, + "step": 277 + }, + { + "epoch": 2.044280442804428, + "grad_norm": 0.8923484894330787, + "learning_rate": 9.297275056951551e-06, + "loss": 0.7329, + "num_tokens": 30705969.0, + "step": 278 + }, + { + "epoch": 2.0516605166051662, + "grad_norm": 0.669629855434588, + "learning_rate": 9.291468896720045e-06, + "loss": 0.7182, + "num_tokens": 30800590.0, + "step": 279 + }, + { + "epoch": 2.059040590405904, + "grad_norm": 0.534127771451391, + "learning_rate": 9.285640897740316e-06, + "loss": 0.7151, + "num_tokens": 30933354.0, + "step": 280 + }, + { + "epoch": 2.066420664206642, + "grad_norm": 0.9857905397677859, + "learning_rate": 9.279791093581461e-06, + "loss": 0.9996, + "num_tokens": 31059251.0, + "step": 281 + }, + { + "epoch": 2.07380073800738, + "grad_norm": 0.4870382736213072, + "learning_rate": 9.27391951793818e-06, + "loss": 0.6776, + "num_tokens": 31174572.0, + "step": 282 + }, + { + "epoch": 2.081180811808118, + "grad_norm": 0.5506320918654295, + "learning_rate": 9.268026204630574e-06, + "loss": 0.6994, + "num_tokens": 31274898.0, + "step": 283 + }, + { + "epoch": 2.088560885608856, + "grad_norm": 0.5200515436580103, + "learning_rate": 9.262111187603953e-06, + "loss": 0.7309, + "num_tokens": 31395801.0, + "step": 284 + }, + { + "epoch": 2.095940959409594, + "grad_norm": 0.5513366405876152, + "learning_rate": 9.25617450092864e-06, + "loss": 0.6825, + "num_tokens": 31490903.0, + "step": 285 + }, + { + "epoch": 2.103321033210332, + "grad_norm": 0.5363819397451034, + "learning_rate": 9.250216178799772e-06, + "loss": 0.7638, + "num_tokens": 31603887.0, + "step": 286 + }, + { + "epoch": 2.11070110701107, + "grad_norm": 0.5423607210477918, + "learning_rate": 9.244236255537108e-06, + "loss": 0.699, + "num_tokens": 31714116.0, + "step": 287 + }, + { + "epoch": 2.1180811808118083, + "grad_norm": 0.5862396441020576, + "learning_rate": 9.23823476558483e-06, + "loss": 0.726, + "num_tokens": 31809657.0, + "step": 288 + }, + { + "epoch": 2.125461254612546, + "grad_norm": 0.6082666311294169, + "learning_rate": 9.23221174351134e-06, + "loss": 0.6569, + "num_tokens": 31903064.0, + "step": 289 + }, + { + "epoch": 2.132841328413284, + "grad_norm": 0.6670626481541114, + "learning_rate": 9.226167224009065e-06, + "loss": 0.735, + "num_tokens": 31977936.0, + "step": 290 + }, + { + "epoch": 2.140221402214022, + "grad_norm": 0.6340720630907666, + "learning_rate": 9.220101241894262e-06, + "loss": 0.7888, + "num_tokens": 32096002.0, + "step": 291 + }, + { + "epoch": 2.14760147601476, + "grad_norm": 0.5591180559329147, + "learning_rate": 9.214013832106806e-06, + "loss": 0.6877, + "num_tokens": 32200262.0, + "step": 292 + }, + { + "epoch": 2.1549815498154983, + "grad_norm": 0.5897306782384196, + "learning_rate": 9.207905029709996e-06, + "loss": 0.7687, + "num_tokens": 32304732.0, + "step": 293 + }, + { + "epoch": 2.162361623616236, + "grad_norm": 0.5283507246823136, + "learning_rate": 9.201774869890351e-06, + "loss": 0.664, + "num_tokens": 32402049.0, + "step": 294 + }, + { + "epoch": 2.169741697416974, + "grad_norm": 0.49182885764939005, + "learning_rate": 9.195623387957412e-06, + "loss": 0.6478, + "num_tokens": 32528738.0, + "step": 295 + }, + { + "epoch": 2.177121771217712, + "grad_norm": 0.5230520282166146, + "learning_rate": 9.18945061934353e-06, + "loss": 0.6481, + "num_tokens": 32633734.0, + "step": 296 + }, + { + "epoch": 2.1845018450184504, + "grad_norm": 0.505393942279067, + "learning_rate": 9.183256599603672e-06, + "loss": 0.7, + "num_tokens": 32744593.0, + "step": 297 + }, + { + "epoch": 2.1918819188191883, + "grad_norm": 0.6036841047133673, + "learning_rate": 9.177041364415203e-06, + "loss": 0.7305, + "num_tokens": 32857772.0, + "step": 298 + }, + { + "epoch": 2.199261992619926, + "grad_norm": 0.5464815698522134, + "learning_rate": 9.170804949577698e-06, + "loss": 0.7285, + "num_tokens": 32950999.0, + "step": 299 + }, + { + "epoch": 2.206642066420664, + "grad_norm": 0.586314392067132, + "learning_rate": 9.16454739101272e-06, + "loss": 0.6988, + "num_tokens": 33034720.0, + "step": 300 + }, + { + "epoch": 2.2140221402214024, + "grad_norm": 0.925180511120846, + "learning_rate": 9.158268724763615e-06, + "loss": 1.1771, + "num_tokens": 33189582.0, + "step": 301 + }, + { + "epoch": 2.2214022140221403, + "grad_norm": 0.5979161298544858, + "learning_rate": 9.151968986995322e-06, + "loss": 0.653, + "num_tokens": 33348052.0, + "step": 302 + }, + { + "epoch": 2.2287822878228782, + "grad_norm": 0.603845704491123, + "learning_rate": 9.14564821399414e-06, + "loss": 0.6487, + "num_tokens": 33447540.0, + "step": 303 + }, + { + "epoch": 2.236162361623616, + "grad_norm": 0.6191017874458337, + "learning_rate": 9.139306442167533e-06, + "loss": 0.7839, + "num_tokens": 33534583.0, + "step": 304 + }, + { + "epoch": 2.243542435424354, + "grad_norm": 0.5816464795523667, + "learning_rate": 9.132943708043919e-06, + "loss": 0.8154, + "num_tokens": 33648628.0, + "step": 305 + }, + { + "epoch": 2.2509225092250924, + "grad_norm": 0.5858083162521804, + "learning_rate": 9.126560048272457e-06, + "loss": 0.7596, + "num_tokens": 33751895.0, + "step": 306 + }, + { + "epoch": 2.2583025830258303, + "grad_norm": 0.5260445329669106, + "learning_rate": 9.12015549962284e-06, + "loss": 0.7344, + "num_tokens": 33878759.0, + "step": 307 + }, + { + "epoch": 2.265682656826568, + "grad_norm": 0.5328592013038843, + "learning_rate": 9.113730098985076e-06, + "loss": 0.6032, + "num_tokens": 33986222.0, + "step": 308 + }, + { + "epoch": 2.273062730627306, + "grad_norm": 0.5404895444418575, + "learning_rate": 9.10728388336928e-06, + "loss": 0.6752, + "num_tokens": 34084937.0, + "step": 309 + }, + { + "epoch": 2.280442804428044, + "grad_norm": 0.5165834237491866, + "learning_rate": 9.100816889905465e-06, + "loss": 0.6338, + "num_tokens": 34203999.0, + "step": 310 + }, + { + "epoch": 2.2878228782287824, + "grad_norm": 0.5079482408380165, + "learning_rate": 9.094329155843323e-06, + "loss": 0.6519, + "num_tokens": 34327452.0, + "step": 311 + }, + { + "epoch": 2.2952029520295203, + "grad_norm": 0.558757746271742, + "learning_rate": 9.087820718552006e-06, + "loss": 0.711, + "num_tokens": 34424365.0, + "step": 312 + }, + { + "epoch": 2.302583025830258, + "grad_norm": 0.56955775758271, + "learning_rate": 9.081291615519921e-06, + "loss": 0.6588, + "num_tokens": 34519556.0, + "step": 313 + }, + { + "epoch": 2.3099630996309966, + "grad_norm": 0.5571316363879791, + "learning_rate": 9.074741884354507e-06, + "loss": 0.7381, + "num_tokens": 34624816.0, + "step": 314 + }, + { + "epoch": 2.3173431734317345, + "grad_norm": 0.5525816487649007, + "learning_rate": 9.068171562782022e-06, + "loss": 0.6969, + "num_tokens": 34734817.0, + "step": 315 + }, + { + "epoch": 2.3247232472324724, + "grad_norm": 0.6353130260407771, + "learning_rate": 9.061580688647322e-06, + "loss": 0.7271, + "num_tokens": 34857095.0, + "step": 316 + }, + { + "epoch": 2.3321033210332103, + "grad_norm": 0.5044947960776338, + "learning_rate": 9.054969299913646e-06, + "loss": 0.6678, + "num_tokens": 34995390.0, + "step": 317 + }, + { + "epoch": 2.339483394833948, + "grad_norm": 0.5037735213732109, + "learning_rate": 9.048337434662398e-06, + "loss": 0.6462, + "num_tokens": 35122926.0, + "step": 318 + }, + { + "epoch": 2.3468634686346865, + "grad_norm": 0.540258518479221, + "learning_rate": 9.041685131092925e-06, + "loss": 0.6824, + "num_tokens": 35233600.0, + "step": 319 + }, + { + "epoch": 2.3542435424354244, + "grad_norm": 0.508870393111074, + "learning_rate": 9.035012427522296e-06, + "loss": 0.6109, + "num_tokens": 35336155.0, + "step": 320 + }, + { + "epoch": 2.3616236162361623, + "grad_norm": 0.5095273169618749, + "learning_rate": 9.028319362385088e-06, + "loss": 0.6582, + "num_tokens": 35455988.0, + "step": 321 + }, + { + "epoch": 2.3690036900369003, + "grad_norm": 0.5901762579618401, + "learning_rate": 9.021605974233153e-06, + "loss": 0.6647, + "num_tokens": 35538087.0, + "step": 322 + }, + { + "epoch": 2.376383763837638, + "grad_norm": 0.49466058873563995, + "learning_rate": 9.014872301735412e-06, + "loss": 0.6884, + "num_tokens": 35705951.0, + "step": 323 + }, + { + "epoch": 2.3837638376383765, + "grad_norm": 0.5962756574638172, + "learning_rate": 9.008118383677618e-06, + "loss": 0.7109, + "num_tokens": 35807170.0, + "step": 324 + }, + { + "epoch": 2.3911439114391144, + "grad_norm": 0.465779093896106, + "learning_rate": 9.001344258962134e-06, + "loss": 0.6042, + "num_tokens": 35925122.0, + "step": 325 + }, + { + "epoch": 2.3985239852398523, + "grad_norm": 0.5839163750401734, + "learning_rate": 8.994549966607723e-06, + "loss": 0.6991, + "num_tokens": 36014729.0, + "step": 326 + }, + { + "epoch": 2.4059040590405902, + "grad_norm": 0.5124430350072646, + "learning_rate": 8.987735545749304e-06, + "loss": 0.6827, + "num_tokens": 36139300.0, + "step": 327 + }, + { + "epoch": 2.4132841328413286, + "grad_norm": 0.5661550803778267, + "learning_rate": 8.980901035637739e-06, + "loss": 0.7175, + "num_tokens": 36235122.0, + "step": 328 + }, + { + "epoch": 2.4206642066420665, + "grad_norm": 0.5589617880405421, + "learning_rate": 8.974046475639605e-06, + "loss": 0.6639, + "num_tokens": 36342683.0, + "step": 329 + }, + { + "epoch": 2.4280442804428044, + "grad_norm": 0.4908227738995962, + "learning_rate": 8.96717190523696e-06, + "loss": 0.6748, + "num_tokens": 36457622.0, + "step": 330 + }, + { + "epoch": 2.4354243542435423, + "grad_norm": 0.578544996079008, + "learning_rate": 8.96027736402713e-06, + "loss": 0.6712, + "num_tokens": 36550707.0, + "step": 331 + }, + { + "epoch": 2.4428044280442807, + "grad_norm": 0.5474890587680412, + "learning_rate": 8.953362891722464e-06, + "loss": 0.6869, + "num_tokens": 36669279.0, + "step": 332 + }, + { + "epoch": 2.4501845018450186, + "grad_norm": 0.4724321116407533, + "learning_rate": 8.94642852815012e-06, + "loss": 0.5751, + "num_tokens": 36793295.0, + "step": 333 + }, + { + "epoch": 2.4575645756457565, + "grad_norm": 0.606460496382911, + "learning_rate": 8.939474313251824e-06, + "loss": 0.7516, + "num_tokens": 36877447.0, + "step": 334 + }, + { + "epoch": 2.4649446494464944, + "grad_norm": 0.6045532758033616, + "learning_rate": 8.932500287083647e-06, + "loss": 0.773, + "num_tokens": 36982748.0, + "step": 335 + }, + { + "epoch": 2.4723247232472323, + "grad_norm": 0.5065444389875737, + "learning_rate": 8.925506489815773e-06, + "loss": 0.636, + "num_tokens": 37094162.0, + "step": 336 + }, + { + "epoch": 2.4797047970479706, + "grad_norm": 0.5799022450280746, + "learning_rate": 8.918492961732268e-06, + "loss": 0.7163, + "num_tokens": 37203237.0, + "step": 337 + }, + { + "epoch": 2.4870848708487086, + "grad_norm": 0.49720680793473937, + "learning_rate": 8.911459743230844e-06, + "loss": 0.6905, + "num_tokens": 37327724.0, + "step": 338 + }, + { + "epoch": 2.4944649446494465, + "grad_norm": 0.5621911034313891, + "learning_rate": 8.904406874822633e-06, + "loss": 0.7011, + "num_tokens": 37422462.0, + "step": 339 + }, + { + "epoch": 2.5018450184501844, + "grad_norm": 0.5993381292011397, + "learning_rate": 8.897334397131945e-06, + "loss": 0.745, + "num_tokens": 37527002.0, + "step": 340 + }, + { + "epoch": 2.5092250922509223, + "grad_norm": 0.5524534340458445, + "learning_rate": 8.89024235089604e-06, + "loss": 0.6314, + "num_tokens": 37644410.0, + "step": 341 + }, + { + "epoch": 2.5166051660516606, + "grad_norm": 0.5968119241632962, + "learning_rate": 8.883130776964896e-06, + "loss": 0.6736, + "num_tokens": 37740096.0, + "step": 342 + }, + { + "epoch": 2.5239852398523985, + "grad_norm": 0.5653506959798167, + "learning_rate": 8.875999716300969e-06, + "loss": 0.6653, + "num_tokens": 37829867.0, + "step": 343 + }, + { + "epoch": 2.5313653136531364, + "grad_norm": 0.5699516231964283, + "learning_rate": 8.868849209978954e-06, + "loss": 0.6591, + "num_tokens": 37924329.0, + "step": 344 + }, + { + "epoch": 2.538745387453875, + "grad_norm": 0.5710685664535915, + "learning_rate": 8.861679299185557e-06, + "loss": 0.7203, + "num_tokens": 38058678.0, + "step": 345 + }, + { + "epoch": 2.5461254612546127, + "grad_norm": 0.4616390155799769, + "learning_rate": 8.85449002521925e-06, + "loss": 0.6561, + "num_tokens": 38202792.0, + "step": 346 + }, + { + "epoch": 2.5535055350553506, + "grad_norm": 0.5849649802018279, + "learning_rate": 8.847281429490037e-06, + "loss": 0.6766, + "num_tokens": 38306259.0, + "step": 347 + }, + { + "epoch": 2.5608856088560885, + "grad_norm": 0.5386758574010692, + "learning_rate": 8.840053553519216e-06, + "loss": 0.6315, + "num_tokens": 38413647.0, + "step": 348 + }, + { + "epoch": 2.5682656826568264, + "grad_norm": 0.49170936889455386, + "learning_rate": 8.832806438939137e-06, + "loss": 0.6465, + "num_tokens": 38544977.0, + "step": 349 + }, + { + "epoch": 2.5756457564575648, + "grad_norm": 0.5775622051020965, + "learning_rate": 8.825540127492966e-06, + "loss": 0.7501, + "num_tokens": 38650617.0, + "step": 350 + }, + { + "epoch": 2.5830258302583027, + "grad_norm": 0.4843732761430568, + "learning_rate": 8.818254661034442e-06, + "loss": 0.6355, + "num_tokens": 38821487.0, + "step": 351 + }, + { + "epoch": 2.5904059040590406, + "grad_norm": 0.5647579621437209, + "learning_rate": 8.810950081527633e-06, + "loss": 0.7156, + "num_tokens": 38947813.0, + "step": 352 + }, + { + "epoch": 2.5977859778597785, + "grad_norm": 0.5087818689310419, + "learning_rate": 8.803626431046703e-06, + "loss": 0.6901, + "num_tokens": 39061069.0, + "step": 353 + }, + { + "epoch": 2.6051660516605164, + "grad_norm": 0.5037922071125235, + "learning_rate": 8.796283751775657e-06, + "loss": 0.6543, + "num_tokens": 39188302.0, + "step": 354 + }, + { + "epoch": 2.6125461254612548, + "grad_norm": 1.0060863716469244, + "learning_rate": 8.78892208600811e-06, + "loss": 0.7447, + "num_tokens": 39283955.0, + "step": 355 + }, + { + "epoch": 2.6199261992619927, + "grad_norm": 0.5341956449839068, + "learning_rate": 8.781541476147043e-06, + "loss": 0.6923, + "num_tokens": 39401219.0, + "step": 356 + }, + { + "epoch": 2.6273062730627306, + "grad_norm": 3.805500375286104, + "learning_rate": 8.774141964704547e-06, + "loss": 0.5917, + "num_tokens": 39537217.0, + "step": 357 + }, + { + "epoch": 2.6346863468634685, + "grad_norm": 0.5309380227267634, + "learning_rate": 8.766723594301585e-06, + "loss": 0.7479, + "num_tokens": 39655525.0, + "step": 358 + }, + { + "epoch": 2.6420664206642064, + "grad_norm": 0.54834990506598, + "learning_rate": 8.759286407667755e-06, + "loss": 0.7124, + "num_tokens": 39753518.0, + "step": 359 + }, + { + "epoch": 2.6494464944649447, + "grad_norm": 0.5247565829789032, + "learning_rate": 8.751830447641028e-06, + "loss": 0.6959, + "num_tokens": 39872001.0, + "step": 360 + }, + { + "epoch": 2.6568265682656826, + "grad_norm": 0.5528126055251835, + "learning_rate": 8.744355757167513e-06, + "loss": 0.7219, + "num_tokens": 39973085.0, + "step": 361 + }, + { + "epoch": 2.6642066420664205, + "grad_norm": 0.7178953596915454, + "learning_rate": 8.736862379301205e-06, + "loss": 1.1287, + "num_tokens": 40117640.0, + "step": 362 + }, + { + "epoch": 2.671586715867159, + "grad_norm": 0.5590814924293989, + "learning_rate": 8.72935035720374e-06, + "loss": 0.6442, + "num_tokens": 40243111.0, + "step": 363 + }, + { + "epoch": 2.678966789667897, + "grad_norm": 0.5858386874871219, + "learning_rate": 8.721819734144137e-06, + "loss": 0.7624, + "num_tokens": 40340644.0, + "step": 364 + }, + { + "epoch": 2.6863468634686347, + "grad_norm": 0.5439140593105536, + "learning_rate": 8.714270553498567e-06, + "loss": 0.8634, + "num_tokens": 40459017.0, + "step": 365 + }, + { + "epoch": 2.6937269372693726, + "grad_norm": 0.5696734249674251, + "learning_rate": 8.706702858750084e-06, + "loss": 0.6883, + "num_tokens": 40567170.0, + "step": 366 + }, + { + "epoch": 2.7011070110701105, + "grad_norm": 0.540221339754704, + "learning_rate": 8.699116693488383e-06, + "loss": 0.6701, + "num_tokens": 40689551.0, + "step": 367 + }, + { + "epoch": 2.708487084870849, + "grad_norm": 0.5268135413989117, + "learning_rate": 8.691512101409553e-06, + "loss": 0.7441, + "num_tokens": 40808209.0, + "step": 368 + }, + { + "epoch": 2.715867158671587, + "grad_norm": 0.5779813810874505, + "learning_rate": 8.68388912631582e-06, + "loss": 0.6434, + "num_tokens": 40920204.0, + "step": 369 + }, + { + "epoch": 2.7232472324723247, + "grad_norm": 0.5715358388081127, + "learning_rate": 8.676247812115288e-06, + "loss": 1.0132, + "num_tokens": 41052491.0, + "step": 370 + }, + { + "epoch": 2.7306273062730626, + "grad_norm": 0.5631079744811832, + "learning_rate": 8.668588202821708e-06, + "loss": 0.6923, + "num_tokens": 41148568.0, + "step": 371 + }, + { + "epoch": 2.7380073800738005, + "grad_norm": 0.6059248747102125, + "learning_rate": 8.660910342554194e-06, + "loss": 0.6413, + "num_tokens": 41229823.0, + "step": 372 + }, + { + "epoch": 2.745387453874539, + "grad_norm": 0.5568122522506863, + "learning_rate": 8.653214275537e-06, + "loss": 0.7091, + "num_tokens": 41350523.0, + "step": 373 + }, + { + "epoch": 2.7527675276752768, + "grad_norm": 0.5156740799977944, + "learning_rate": 8.645500046099237e-06, + "loss": 0.6046, + "num_tokens": 41461892.0, + "step": 374 + }, + { + "epoch": 2.7601476014760147, + "grad_norm": 0.5328035035712879, + "learning_rate": 8.637767698674642e-06, + "loss": 0.7124, + "num_tokens": 41599826.0, + "step": 375 + }, + { + "epoch": 2.767527675276753, + "grad_norm": 0.6395151238615233, + "learning_rate": 8.630017277801306e-06, + "loss": 0.6799, + "num_tokens": 41684056.0, + "step": 376 + }, + { + "epoch": 2.774907749077491, + "grad_norm": 0.5718677138308345, + "learning_rate": 8.62224882812142e-06, + "loss": 0.6698, + "num_tokens": 41789363.0, + "step": 377 + }, + { + "epoch": 2.782287822878229, + "grad_norm": 0.5745655870409727, + "learning_rate": 8.614462394381028e-06, + "loss": 0.6536, + "num_tokens": 41902328.0, + "step": 378 + }, + { + "epoch": 2.7896678966789668, + "grad_norm": 0.5962600314891544, + "learning_rate": 8.606658021429754e-06, + "loss": 0.7095, + "num_tokens": 42013036.0, + "step": 379 + }, + { + "epoch": 2.7970479704797047, + "grad_norm": 0.589009128309455, + "learning_rate": 8.598835754220554e-06, + "loss": 0.7223, + "num_tokens": 42142939.0, + "step": 380 + }, + { + "epoch": 2.804428044280443, + "grad_norm": 0.6135753550691891, + "learning_rate": 8.590995637809459e-06, + "loss": 0.6928, + "num_tokens": 42225463.0, + "step": 381 + }, + { + "epoch": 2.811808118081181, + "grad_norm": 0.6250756573123484, + "learning_rate": 8.5831377173553e-06, + "loss": 0.7181, + "num_tokens": 42307416.0, + "step": 382 + }, + { + "epoch": 2.819188191881919, + "grad_norm": 0.5798107659221702, + "learning_rate": 8.575262038119468e-06, + "loss": 0.8076, + "num_tokens": 42427179.0, + "step": 383 + }, + { + "epoch": 2.8265682656826567, + "grad_norm": 0.531136083790205, + "learning_rate": 8.567368645465646e-06, + "loss": 0.6446, + "num_tokens": 42530634.0, + "step": 384 + }, + { + "epoch": 2.8339483394833946, + "grad_norm": 0.5527830503779945, + "learning_rate": 8.559457584859537e-06, + "loss": 0.7242, + "num_tokens": 42637999.0, + "step": 385 + }, + { + "epoch": 2.841328413284133, + "grad_norm": 0.5614225280721861, + "learning_rate": 8.551528901868614e-06, + "loss": 0.6411, + "num_tokens": 42728403.0, + "step": 386 + }, + { + "epoch": 2.848708487084871, + "grad_norm": 0.5346554017745229, + "learning_rate": 8.543582642161857e-06, + "loss": 0.6717, + "num_tokens": 42838803.0, + "step": 387 + }, + { + "epoch": 2.856088560885609, + "grad_norm": 0.6029067731047304, + "learning_rate": 8.535618851509487e-06, + "loss": 0.7271, + "num_tokens": 42952058.0, + "step": 388 + }, + { + "epoch": 2.8634686346863467, + "grad_norm": 0.5218810595386072, + "learning_rate": 8.5276375757827e-06, + "loss": 0.5411, + "num_tokens": 43052733.0, + "step": 389 + }, + { + "epoch": 2.8708487084870846, + "grad_norm": 0.6781573031807308, + "learning_rate": 8.519638860953408e-06, + "loss": 0.747, + "num_tokens": 43127428.0, + "step": 390 + }, + { + "epoch": 2.878228782287823, + "grad_norm": 0.5474215701215952, + "learning_rate": 8.511622753093971e-06, + "loss": 0.6845, + "num_tokens": 43245146.0, + "step": 391 + }, + { + "epoch": 2.885608856088561, + "grad_norm": 0.5828226303070865, + "learning_rate": 8.503589298376931e-06, + "loss": 0.6714, + "num_tokens": 43343029.0, + "step": 392 + }, + { + "epoch": 2.892988929889299, + "grad_norm": 0.5566542613181935, + "learning_rate": 8.49553854307475e-06, + "loss": 0.7581, + "num_tokens": 43449381.0, + "step": 393 + }, + { + "epoch": 2.900369003690037, + "grad_norm": 0.5143312406285538, + "learning_rate": 8.48747053355954e-06, + "loss": 0.7023, + "num_tokens": 43605262.0, + "step": 394 + }, + { + "epoch": 2.907749077490775, + "grad_norm": 0.5796296239031932, + "learning_rate": 8.479385316302793e-06, + "loss": 0.6038, + "num_tokens": 43695066.0, + "step": 395 + }, + { + "epoch": 2.915129151291513, + "grad_norm": 0.5717544123390733, + "learning_rate": 8.47128293787512e-06, + "loss": 0.632, + "num_tokens": 43808970.0, + "step": 396 + }, + { + "epoch": 2.922509225092251, + "grad_norm": 0.5870739135203897, + "learning_rate": 8.463163444945986e-06, + "loss": 0.7216, + "num_tokens": 43933194.0, + "step": 397 + }, + { + "epoch": 2.9298892988929888, + "grad_norm": 0.5409023547367442, + "learning_rate": 8.455026884283424e-06, + "loss": 0.65, + "num_tokens": 44043681.0, + "step": 398 + }, + { + "epoch": 2.937269372693727, + "grad_norm": 0.5728886078954685, + "learning_rate": 8.446873302753783e-06, + "loss": 0.756, + "num_tokens": 44162249.0, + "step": 399 + }, + { + "epoch": 2.944649446494465, + "grad_norm": 0.5573913061191518, + "learning_rate": 8.43870274732145e-06, + "loss": 0.7583, + "num_tokens": 44275749.0, + "step": 400 + }, + { + "epoch": 2.952029520295203, + "grad_norm": 0.6759949052252414, + "learning_rate": 8.430515265048584e-06, + "loss": 0.7366, + "num_tokens": 44363141.0, + "step": 401 + }, + { + "epoch": 2.959409594095941, + "grad_norm": 0.5658923076568075, + "learning_rate": 8.422310903094836e-06, + "loss": 0.7266, + "num_tokens": 44460736.0, + "step": 402 + }, + { + "epoch": 2.9667896678966788, + "grad_norm": 0.5534862140926422, + "learning_rate": 8.41408970871709e-06, + "loss": 0.6982, + "num_tokens": 44555897.0, + "step": 403 + }, + { + "epoch": 2.974169741697417, + "grad_norm": 0.5525386920550562, + "learning_rate": 8.405851729269179e-06, + "loss": 0.6984, + "num_tokens": 44660818.0, + "step": 404 + }, + { + "epoch": 2.981549815498155, + "grad_norm": 0.5467576384243312, + "learning_rate": 8.39759701220162e-06, + "loss": 0.7544, + "num_tokens": 44775293.0, + "step": 405 + }, + { + "epoch": 2.988929889298893, + "grad_norm": 0.5762236970584276, + "learning_rate": 8.389325605061343e-06, + "loss": 0.7656, + "num_tokens": 44895620.0, + "step": 406 + }, + { + "epoch": 2.9963099630996313, + "grad_norm": 0.5415184898333844, + "learning_rate": 8.381037555491401e-06, + "loss": 0.7289, + "num_tokens": 45008883.0, + "step": 407 + }, + { + "epoch": 3.0, + "grad_norm": 0.5415184898333844, + "learning_rate": 8.372732911230717e-06, + "loss": 0.742, + "num_tokens": 45056415.0, + "step": 408 + }, + { + "epoch": 3.007380073800738, + "grad_norm": 1.0671420058937142, + "learning_rate": 8.364411720113794e-06, + "loss": 0.6045, + "num_tokens": 45189762.0, + "step": 409 + }, + { + "epoch": 3.014760147601476, + "grad_norm": 0.7014142965782201, + "learning_rate": 8.356074030070447e-06, + "loss": 0.5984, + "num_tokens": 45301171.0, + "step": 410 + }, + { + "epoch": 3.022140221402214, + "grad_norm": 0.5815507535960692, + "learning_rate": 8.347719889125521e-06, + "loss": 0.5508, + "num_tokens": 45408595.0, + "step": 411 + }, + { + "epoch": 3.029520295202952, + "grad_norm": 0.6209554212604178, + "learning_rate": 8.339349345398622e-06, + "loss": 0.559, + "num_tokens": 45534003.0, + "step": 412 + }, + { + "epoch": 3.03690036900369, + "grad_norm": 0.5657155409020301, + "learning_rate": 8.33096244710383e-06, + "loss": 0.5474, + "num_tokens": 45687169.0, + "step": 413 + }, + { + "epoch": 3.044280442804428, + "grad_norm": 0.7085556124145286, + "learning_rate": 8.322559242549435e-06, + "loss": 0.5636, + "num_tokens": 45783210.0, + "step": 414 + }, + { + "epoch": 3.0516605166051662, + "grad_norm": 0.8664457929036407, + "learning_rate": 8.31413978013764e-06, + "loss": 0.4727, + "num_tokens": 45880811.0, + "step": 415 + }, + { + "epoch": 3.059040590405904, + "grad_norm": 0.7915254557523631, + "learning_rate": 8.305704108364301e-06, + "loss": 0.5057, + "num_tokens": 46016909.0, + "step": 416 + }, + { + "epoch": 3.066420664206642, + "grad_norm": 0.6904619869684991, + "learning_rate": 8.297252275818639e-06, + "loss": 0.5403, + "num_tokens": 46154743.0, + "step": 417 + }, + { + "epoch": 3.07380073800738, + "grad_norm": 0.6704294101177389, + "learning_rate": 8.288784331182954e-06, + "loss": 0.5402, + "num_tokens": 46253315.0, + "step": 418 + }, + { + "epoch": 3.081180811808118, + "grad_norm": 0.7803874824846486, + "learning_rate": 8.280300323232361e-06, + "loss": 1.0731, + "num_tokens": 46386831.0, + "step": 419 + }, + { + "epoch": 3.088560885608856, + "grad_norm": 0.6204587386033913, + "learning_rate": 8.271800300834488e-06, + "loss": 0.5226, + "num_tokens": 46486510.0, + "step": 420 + }, + { + "epoch": 3.095940959409594, + "grad_norm": 0.6805498120059815, + "learning_rate": 8.263284312949215e-06, + "loss": 0.5697, + "num_tokens": 46567527.0, + "step": 421 + }, + { + "epoch": 3.103321033210332, + "grad_norm": 0.6085858333682164, + "learning_rate": 8.254752408628378e-06, + "loss": 0.5771, + "num_tokens": 46696494.0, + "step": 422 + }, + { + "epoch": 3.11070110701107, + "grad_norm": 0.5450373938151896, + "learning_rate": 8.246204637015494e-06, + "loss": 0.5062, + "num_tokens": 46810646.0, + "step": 423 + }, + { + "epoch": 3.1180811808118083, + "grad_norm": 0.5420317314750567, + "learning_rate": 8.237641047345473e-06, + "loss": 0.5446, + "num_tokens": 46928998.0, + "step": 424 + }, + { + "epoch": 3.125461254612546, + "grad_norm": 0.7244311467151333, + "learning_rate": 8.229061688944335e-06, + "loss": 0.5372, + "num_tokens": 47023410.0, + "step": 425 + }, + { + "epoch": 3.132841328413284, + "grad_norm": 0.610314609252109, + "learning_rate": 8.220466611228931e-06, + "loss": 0.4839, + "num_tokens": 47154761.0, + "step": 426 + }, + { + "epoch": 3.140221402214022, + "grad_norm": 0.8006612716945156, + "learning_rate": 8.211855863706654e-06, + "loss": 0.5217, + "num_tokens": 47270799.0, + "step": 427 + }, + { + "epoch": 3.14760147601476, + "grad_norm": 0.592305967306938, + "learning_rate": 8.203229495975154e-06, + "loss": 0.4979, + "num_tokens": 47451649.0, + "step": 428 + }, + { + "epoch": 3.1549815498154983, + "grad_norm": 0.5474418113355926, + "learning_rate": 8.194587557722053e-06, + "loss": 0.5136, + "num_tokens": 47551002.0, + "step": 429 + }, + { + "epoch": 3.162361623616236, + "grad_norm": 0.6660539332965522, + "learning_rate": 8.185930098724657e-06, + "loss": 0.5439, + "num_tokens": 47654318.0, + "step": 430 + }, + { + "epoch": 3.169741697416974, + "grad_norm": 0.6382441671849843, + "learning_rate": 8.177257168849673e-06, + "loss": 0.5595, + "num_tokens": 47797604.0, + "step": 431 + }, + { + "epoch": 3.177121771217712, + "grad_norm": 0.5849382339843232, + "learning_rate": 8.168568818052924e-06, + "loss": 0.5581, + "num_tokens": 47917575.0, + "step": 432 + }, + { + "epoch": 3.1845018450184504, + "grad_norm": 0.5770792253720011, + "learning_rate": 8.159865096379046e-06, + "loss": 0.4999, + "num_tokens": 47993898.0, + "step": 433 + }, + { + "epoch": 3.1918819188191883, + "grad_norm": 0.6995519838702828, + "learning_rate": 8.151146053961218e-06, + "loss": 0.5878, + "num_tokens": 48111580.0, + "step": 434 + }, + { + "epoch": 3.199261992619926, + "grad_norm": 0.7163630765514822, + "learning_rate": 8.142411741020872e-06, + "loss": 0.4991, + "num_tokens": 48200358.0, + "step": 435 + }, + { + "epoch": 3.206642066420664, + "grad_norm": 0.6392968082830581, + "learning_rate": 8.133662207867383e-06, + "loss": 0.5478, + "num_tokens": 48294625.0, + "step": 436 + }, + { + "epoch": 3.2140221402214024, + "grad_norm": 0.6354099390161857, + "learning_rate": 8.124897504897806e-06, + "loss": 0.746, + "num_tokens": 48395339.0, + "step": 437 + }, + { + "epoch": 3.2214022140221403, + "grad_norm": 0.8664104426738044, + "learning_rate": 8.116117682596571e-06, + "loss": 0.6143, + "num_tokens": 48480560.0, + "step": 438 + }, + { + "epoch": 3.2287822878228782, + "grad_norm": 0.7067089339309491, + "learning_rate": 8.10732279153519e-06, + "loss": 0.5595, + "num_tokens": 48605710.0, + "step": 439 + }, + { + "epoch": 3.236162361623616, + "grad_norm": 0.562171737471794, + "learning_rate": 8.098512882371977e-06, + "loss": 0.5264, + "num_tokens": 48734584.0, + "step": 440 + }, + { + "epoch": 3.243542435424354, + "grad_norm": 0.552023532088518, + "learning_rate": 8.089688005851746e-06, + "loss": 0.4797, + "num_tokens": 48879534.0, + "step": 441 + }, + { + "epoch": 3.2509225092250924, + "grad_norm": 0.5902870119743631, + "learning_rate": 8.080848212805526e-06, + "loss": 0.5482, + "num_tokens": 48983752.0, + "step": 442 + }, + { + "epoch": 3.2583025830258303, + "grad_norm": 0.6035750241527118, + "learning_rate": 8.071993554150258e-06, + "loss": 0.535, + "num_tokens": 49097804.0, + "step": 443 + }, + { + "epoch": 3.265682656826568, + "grad_norm": 0.7048045755523692, + "learning_rate": 8.063124080888514e-06, + "loss": 0.513, + "num_tokens": 49212376.0, + "step": 444 + }, + { + "epoch": 3.273062730627306, + "grad_norm": 0.6574564335668868, + "learning_rate": 8.0542398441082e-06, + "loss": 0.5927, + "num_tokens": 49330960.0, + "step": 445 + }, + { + "epoch": 3.280442804428044, + "grad_norm": 0.663758732630871, + "learning_rate": 8.045340894982254e-06, + "loss": 0.5954, + "num_tokens": 49429078.0, + "step": 446 + }, + { + "epoch": 3.2878228782287824, + "grad_norm": 0.6512328570912242, + "learning_rate": 8.036427284768357e-06, + "loss": 0.5032, + "num_tokens": 49541765.0, + "step": 447 + }, + { + "epoch": 3.2952029520295203, + "grad_norm": 0.5798248571379592, + "learning_rate": 8.027499064808642e-06, + "loss": 0.4804, + "num_tokens": 49653291.0, + "step": 448 + }, + { + "epoch": 3.302583025830258, + "grad_norm": 0.614467290698762, + "learning_rate": 8.018556286529387e-06, + "loss": 0.5653, + "num_tokens": 49761203.0, + "step": 449 + }, + { + "epoch": 3.3099630996309966, + "grad_norm": 0.5686386180748071, + "learning_rate": 8.009599001440733e-06, + "loss": 0.4794, + "num_tokens": 49912863.0, + "step": 450 + }, + { + "epoch": 3.3173431734317345, + "grad_norm": 0.6032697718012249, + "learning_rate": 8.000627261136375e-06, + "loss": 0.5175, + "num_tokens": 50011025.0, + "step": 451 + }, + { + "epoch": 3.3247232472324724, + "grad_norm": 0.6065988231357294, + "learning_rate": 7.991641117293267e-06, + "loss": 0.5121, + "num_tokens": 50132098.0, + "step": 452 + }, + { + "epoch": 3.3321033210332103, + "grad_norm": 0.6562165232005108, + "learning_rate": 7.982640621671336e-06, + "loss": 0.6779, + "num_tokens": 50220571.0, + "step": 453 + }, + { + "epoch": 3.339483394833948, + "grad_norm": 0.6967924204298649, + "learning_rate": 7.973625826113167e-06, + "loss": 0.5329, + "num_tokens": 50350796.0, + "step": 454 + }, + { + "epoch": 3.3468634686346865, + "grad_norm": 0.5798961290769074, + "learning_rate": 7.964596782543717e-06, + "loss": 0.6361, + "num_tokens": 50460299.0, + "step": 455 + }, + { + "epoch": 3.3542435424354244, + "grad_norm": 0.5621999963338501, + "learning_rate": 7.955553542970003e-06, + "loss": 0.5423, + "num_tokens": 50585790.0, + "step": 456 + }, + { + "epoch": 3.3616236162361623, + "grad_norm": 0.635123869146959, + "learning_rate": 7.94649615948082e-06, + "loss": 0.5394, + "num_tokens": 50682415.0, + "step": 457 + }, + { + "epoch": 3.3690036900369003, + "grad_norm": 0.5943185984471594, + "learning_rate": 7.93742468424643e-06, + "loss": 0.4831, + "num_tokens": 50782466.0, + "step": 458 + }, + { + "epoch": 3.376383763837638, + "grad_norm": 0.6890091213779607, + "learning_rate": 7.928339169518257e-06, + "loss": 0.5852, + "num_tokens": 50883210.0, + "step": 459 + }, + { + "epoch": 3.3837638376383765, + "grad_norm": 0.6414354230918646, + "learning_rate": 7.9192396676286e-06, + "loss": 0.5412, + "num_tokens": 50988218.0, + "step": 460 + }, + { + "epoch": 3.3911439114391144, + "grad_norm": 0.6566503833612184, + "learning_rate": 7.910126230990313e-06, + "loss": 0.5438, + "num_tokens": 51090015.0, + "step": 461 + }, + { + "epoch": 3.3985239852398523, + "grad_norm": 0.6376977950559205, + "learning_rate": 7.900998912096528e-06, + "loss": 0.5139, + "num_tokens": 51214982.0, + "step": 462 + }, + { + "epoch": 3.4059040590405902, + "grad_norm": 0.6764222073349121, + "learning_rate": 7.891857763520327e-06, + "loss": 0.5585, + "num_tokens": 51325827.0, + "step": 463 + }, + { + "epoch": 3.4132841328413286, + "grad_norm": 0.600654311601057, + "learning_rate": 7.882702837914455e-06, + "loss": 0.5435, + "num_tokens": 51450148.0, + "step": 464 + }, + { + "epoch": 3.4206642066420665, + "grad_norm": 0.6744841627045786, + "learning_rate": 7.873534188011009e-06, + "loss": 0.5234, + "num_tokens": 51543000.0, + "step": 465 + }, + { + "epoch": 3.4280442804428044, + "grad_norm": 0.6170707900142415, + "learning_rate": 7.864351866621143e-06, + "loss": 0.6331, + "num_tokens": 51649095.0, + "step": 466 + }, + { + "epoch": 3.4354243542435423, + "grad_norm": 0.6536867528779, + "learning_rate": 7.855155926634755e-06, + "loss": 0.5194, + "num_tokens": 51747999.0, + "step": 467 + }, + { + "epoch": 3.4428044280442807, + "grad_norm": 0.6685792006676133, + "learning_rate": 7.845946421020186e-06, + "loss": 0.4761, + "num_tokens": 51840725.0, + "step": 468 + }, + { + "epoch": 3.4501845018450186, + "grad_norm": 0.583372186794409, + "learning_rate": 7.836723402823913e-06, + "loss": 0.5723, + "num_tokens": 51938008.0, + "step": 469 + }, + { + "epoch": 3.4575645756457565, + "grad_norm": 0.6457686614718543, + "learning_rate": 7.82748692517025e-06, + "loss": 0.5624, + "num_tokens": 52027007.0, + "step": 470 + }, + { + "epoch": 3.4649446494464944, + "grad_norm": 0.6192253028170305, + "learning_rate": 7.818237041261032e-06, + "loss": 0.5552, + "num_tokens": 52148401.0, + "step": 471 + }, + { + "epoch": 3.4723247232472323, + "grad_norm": 0.6157880744622222, + "learning_rate": 7.808973804375318e-06, + "loss": 0.5367, + "num_tokens": 52251924.0, + "step": 472 + }, + { + "epoch": 3.4797047970479706, + "grad_norm": 0.585562845678871, + "learning_rate": 7.799697267869073e-06, + "loss": 0.5716, + "num_tokens": 52364563.0, + "step": 473 + }, + { + "epoch": 3.4870848708487086, + "grad_norm": 0.5810620097720413, + "learning_rate": 7.790407485174873e-06, + "loss": 0.4792, + "num_tokens": 52491358.0, + "step": 474 + }, + { + "epoch": 3.4944649446494465, + "grad_norm": 0.7063772145482928, + "learning_rate": 7.781104509801594e-06, + "loss": 0.5479, + "num_tokens": 52581166.0, + "step": 475 + }, + { + "epoch": 3.5018450184501844, + "grad_norm": 0.6787021375520482, + "learning_rate": 7.771788395334096e-06, + "loss": 0.5712, + "num_tokens": 52703377.0, + "step": 476 + }, + { + "epoch": 3.5092250922509223, + "grad_norm": 0.6608323171116782, + "learning_rate": 7.762459195432917e-06, + "loss": 0.4981, + "num_tokens": 52823591.0, + "step": 477 + }, + { + "epoch": 3.5166051660516606, + "grad_norm": 0.6684994354546132, + "learning_rate": 7.753116963833977e-06, + "loss": 0.5228, + "num_tokens": 52922716.0, + "step": 478 + }, + { + "epoch": 3.5239852398523985, + "grad_norm": 0.685837014810044, + "learning_rate": 7.74376175434825e-06, + "loss": 0.5645, + "num_tokens": 53027206.0, + "step": 479 + }, + { + "epoch": 3.5313653136531364, + "grad_norm": 0.533647870419083, + "learning_rate": 7.734393620861467e-06, + "loss": 0.4529, + "num_tokens": 53147758.0, + "step": 480 + }, + { + "epoch": 3.538745387453875, + "grad_norm": 0.6233894301978303, + "learning_rate": 7.725012617333796e-06, + "loss": 0.5843, + "num_tokens": 53260966.0, + "step": 481 + }, + { + "epoch": 3.5461254612546127, + "grad_norm": 0.7704460497456811, + "learning_rate": 7.71561879779954e-06, + "loss": 0.533, + "num_tokens": 53348564.0, + "step": 482 + }, + { + "epoch": 3.5535055350553506, + "grad_norm": 0.5821959035028632, + "learning_rate": 7.706212216366821e-06, + "loss": 0.4733, + "num_tokens": 53502326.0, + "step": 483 + }, + { + "epoch": 3.5608856088560885, + "grad_norm": 0.5912639182099809, + "learning_rate": 7.696792927217266e-06, + "loss": 0.6074, + "num_tokens": 53621069.0, + "step": 484 + }, + { + "epoch": 3.5682656826568264, + "grad_norm": 0.7008824003925574, + "learning_rate": 7.687360984605705e-06, + "loss": 0.5761, + "num_tokens": 53713397.0, + "step": 485 + }, + { + "epoch": 3.5756457564575648, + "grad_norm": 0.640550494460841, + "learning_rate": 7.677916442859843e-06, + "loss": 0.5743, + "num_tokens": 53811627.0, + "step": 486 + }, + { + "epoch": 3.5830258302583027, + "grad_norm": 0.697013155404617, + "learning_rate": 7.66845935637996e-06, + "loss": 0.5367, + "num_tokens": 53910389.0, + "step": 487 + }, + { + "epoch": 3.5904059040590406, + "grad_norm": 0.6260702071794642, + "learning_rate": 7.658989779638599e-06, + "loss": 0.5189, + "num_tokens": 54014073.0, + "step": 488 + }, + { + "epoch": 3.5977859778597785, + "grad_norm": 0.5872641163913606, + "learning_rate": 7.649507767180233e-06, + "loss": 0.5334, + "num_tokens": 54125732.0, + "step": 489 + }, + { + "epoch": 3.6051660516605164, + "grad_norm": 0.5706512850075195, + "learning_rate": 7.64001337362098e-06, + "loss": 0.547, + "num_tokens": 54245332.0, + "step": 490 + }, + { + "epoch": 3.6125461254612548, + "grad_norm": 0.6762724541633397, + "learning_rate": 7.630506653648257e-06, + "loss": 0.466, + "num_tokens": 54342768.0, + "step": 491 + }, + { + "epoch": 3.6199261992619927, + "grad_norm": 0.7062756340518096, + "learning_rate": 7.620987662020495e-06, + "loss": 0.5768, + "num_tokens": 54428990.0, + "step": 492 + }, + { + "epoch": 3.6273062730627306, + "grad_norm": 0.7303709118580136, + "learning_rate": 7.611456453566799e-06, + "loss": 0.613, + "num_tokens": 54513201.0, + "step": 493 + }, + { + "epoch": 3.6346863468634685, + "grad_norm": 0.7786447083255501, + "learning_rate": 7.601913083186648e-06, + "loss": 0.5683, + "num_tokens": 54602874.0, + "step": 494 + }, + { + "epoch": 3.6420664206642064, + "grad_norm": 0.6574885010125108, + "learning_rate": 7.592357605849572e-06, + "loss": 0.5448, + "num_tokens": 54688807.0, + "step": 495 + }, + { + "epoch": 3.6494464944649447, + "grad_norm": 0.8591301030957893, + "learning_rate": 7.582790076594836e-06, + "loss": 0.6044, + "num_tokens": 54772709.0, + "step": 496 + }, + { + "epoch": 3.6568265682656826, + "grad_norm": 0.6372147818190681, + "learning_rate": 7.573210550531126e-06, + "loss": 0.5889, + "num_tokens": 54904581.0, + "step": 497 + }, + { + "epoch": 3.6642066420664205, + "grad_norm": 0.5816280072454586, + "learning_rate": 7.563619082836225e-06, + "loss": 0.5659, + "num_tokens": 55055097.0, + "step": 498 + }, + { + "epoch": 3.671586715867159, + "grad_norm": 0.7373811135417339, + "learning_rate": 7.554015728756705e-06, + "loss": 0.5186, + "num_tokens": 55144312.0, + "step": 499 + }, + { + "epoch": 3.678966789667897, + "grad_norm": 0.6664043352066058, + "learning_rate": 7.544400543607599e-06, + "loss": 0.568, + "num_tokens": 55242571.0, + "step": 500 + }, + { + "epoch": 3.6863468634686347, + "grad_norm": 0.6349135676386292, + "learning_rate": 7.534773582772087e-06, + "loss": 0.5322, + "num_tokens": 55334546.0, + "step": 501 + }, + { + "epoch": 3.6937269372693726, + "grad_norm": 0.6862750480162775, + "learning_rate": 7.525134901701178e-06, + "loss": 0.5333, + "num_tokens": 55437983.0, + "step": 502 + }, + { + "epoch": 3.7011070110701105, + "grad_norm": 0.6348808197020509, + "learning_rate": 7.515484555913388e-06, + "loss": 0.5853, + "num_tokens": 55555490.0, + "step": 503 + }, + { + "epoch": 3.708487084870849, + "grad_norm": 0.6129333693971603, + "learning_rate": 7.5058226009944235e-06, + "loss": 0.4703, + "num_tokens": 55667192.0, + "step": 504 + }, + { + "epoch": 3.715867158671587, + "grad_norm": 0.6151258674669042, + "learning_rate": 7.496149092596856e-06, + "loss": 0.6096, + "num_tokens": 55780741.0, + "step": 505 + }, + { + "epoch": 3.7232472324723247, + "grad_norm": 0.6083067464728237, + "learning_rate": 7.48646408643981e-06, + "loss": 0.496, + "num_tokens": 55913521.0, + "step": 506 + }, + { + "epoch": 3.7306273062730626, + "grad_norm": 0.6108105406176739, + "learning_rate": 7.476767638308628e-06, + "loss": 0.4575, + "num_tokens": 56042599.0, + "step": 507 + }, + { + "epoch": 3.7380073800738005, + "grad_norm": 0.6366911234823489, + "learning_rate": 7.467059804054567e-06, + "loss": 0.9373, + "num_tokens": 56174558.0, + "step": 508 + }, + { + "epoch": 3.745387453874539, + "grad_norm": 0.5652847911595245, + "learning_rate": 7.457340639594463e-06, + "loss": 0.4751, + "num_tokens": 56297335.0, + "step": 509 + }, + { + "epoch": 3.7527675276752768, + "grad_norm": 0.6336279282262137, + "learning_rate": 7.447610200910417e-06, + "loss": 0.5882, + "num_tokens": 56417100.0, + "step": 510 + }, + { + "epoch": 3.7601476014760147, + "grad_norm": 0.6565595172181139, + "learning_rate": 7.437868544049464e-06, + "loss": 0.5453, + "num_tokens": 56513620.0, + "step": 511 + }, + { + "epoch": 3.767527675276753, + "grad_norm": 0.6883459474029607, + "learning_rate": 7.428115725123256e-06, + "loss": 0.4716, + "num_tokens": 56628754.0, + "step": 512 + }, + { + "epoch": 3.774907749077491, + "grad_norm": 0.5859063630882125, + "learning_rate": 7.4183518003077445e-06, + "loss": 0.5763, + "num_tokens": 56720718.0, + "step": 513 + }, + { + "epoch": 3.782287822878229, + "grad_norm": 0.6638307638429727, + "learning_rate": 7.408576825842845e-06, + "loss": 0.5437, + "num_tokens": 56813942.0, + "step": 514 + }, + { + "epoch": 3.7896678966789668, + "grad_norm": 0.650844785515401, + "learning_rate": 7.39879085803212e-06, + "loss": 0.5295, + "num_tokens": 56933381.0, + "step": 515 + }, + { + "epoch": 3.7970479704797047, + "grad_norm": 0.6960337012516411, + "learning_rate": 7.388993953242453e-06, + "loss": 0.5942, + "num_tokens": 57031898.0, + "step": 516 + }, + { + "epoch": 3.804428044280443, + "grad_norm": 0.6013857034775665, + "learning_rate": 7.379186167903726e-06, + "loss": 0.8671, + "num_tokens": 57159544.0, + "step": 517 + }, + { + "epoch": 3.811808118081181, + "grad_norm": 0.6324109984635025, + "learning_rate": 7.36936755850849e-06, + "loss": 0.5234, + "num_tokens": 57262915.0, + "step": 518 + }, + { + "epoch": 3.819188191881919, + "grad_norm": 0.6543810076842617, + "learning_rate": 7.359538181611643e-06, + "loss": 0.554, + "num_tokens": 57369864.0, + "step": 519 + }, + { + "epoch": 3.8265682656826567, + "grad_norm": 0.6516252107608712, + "learning_rate": 7.349698093830106e-06, + "loss": 0.5926, + "num_tokens": 57474391.0, + "step": 520 + }, + { + "epoch": 3.8339483394833946, + "grad_norm": 0.6137494004803659, + "learning_rate": 7.3398473518424886e-06, + "loss": 0.5325, + "num_tokens": 57578201.0, + "step": 521 + }, + { + "epoch": 3.841328413284133, + "grad_norm": 0.6303464613399665, + "learning_rate": 7.329986012388775e-06, + "loss": 0.5099, + "num_tokens": 57697852.0, + "step": 522 + }, + { + "epoch": 3.848708487084871, + "grad_norm": 0.568654398023787, + "learning_rate": 7.320114132269988e-06, + "loss": 0.5581, + "num_tokens": 57835634.0, + "step": 523 + }, + { + "epoch": 3.856088560885609, + "grad_norm": 0.5934669338639725, + "learning_rate": 7.310231768347862e-06, + "loss": 0.5822, + "num_tokens": 57909013.0, + "step": 524 + }, + { + "epoch": 3.8634686346863467, + "grad_norm": 0.7395411253378467, + "learning_rate": 7.30033897754452e-06, + "loss": 0.5455, + "num_tokens": 58030797.0, + "step": 525 + }, + { + "epoch": 3.8708487084870846, + "grad_norm": 0.6244514128434226, + "learning_rate": 7.290435816842144e-06, + "loss": 0.577, + "num_tokens": 58126913.0, + "step": 526 + }, + { + "epoch": 3.878228782287823, + "grad_norm": 0.6728134092311296, + "learning_rate": 7.280522343282647e-06, + "loss": 0.516, + "num_tokens": 58236344.0, + "step": 527 + }, + { + "epoch": 3.885608856088561, + "grad_norm": 0.6290426751390251, + "learning_rate": 7.270598613967339e-06, + "loss": 0.4682, + "num_tokens": 58353595.0, + "step": 528 + }, + { + "epoch": 3.892988929889299, + "grad_norm": 0.6628163607802886, + "learning_rate": 7.260664686056606e-06, + "loss": 0.5101, + "num_tokens": 58496620.0, + "step": 529 + }, + { + "epoch": 3.900369003690037, + "grad_norm": 0.6254928081028875, + "learning_rate": 7.250720616769581e-06, + "loss": 0.5779, + "num_tokens": 58582092.0, + "step": 530 + }, + { + "epoch": 3.907749077490775, + "grad_norm": 0.6720141890936548, + "learning_rate": 7.2407664633838035e-06, + "loss": 0.4887, + "num_tokens": 58695778.0, + "step": 531 + }, + { + "epoch": 3.915129151291513, + "grad_norm": 0.6378774459005809, + "learning_rate": 7.230802283234905e-06, + "loss": 0.5153, + "num_tokens": 58809203.0, + "step": 532 + }, + { + "epoch": 3.922509225092251, + "grad_norm": 0.5348202191069997, + "learning_rate": 7.220828133716268e-06, + "loss": 0.5236, + "num_tokens": 58933023.0, + "step": 533 + }, + { + "epoch": 3.9298892988929888, + "grad_norm": 0.6866864951443549, + "learning_rate": 7.210844072278694e-06, + "loss": 0.5422, + "num_tokens": 59030016.0, + "step": 534 + }, + { + "epoch": 3.937269372693727, + "grad_norm": 0.6456241271796205, + "learning_rate": 7.20085015643008e-06, + "loss": 0.5258, + "num_tokens": 59140666.0, + "step": 535 + }, + { + "epoch": 3.944649446494465, + "grad_norm": 0.5954729482509249, + "learning_rate": 7.190846443735088e-06, + "loss": 0.546, + "num_tokens": 59282134.0, + "step": 536 + }, + { + "epoch": 3.952029520295203, + "grad_norm": 0.5907445946022006, + "learning_rate": 7.180832991814802e-06, + "loss": 0.4798, + "num_tokens": 59378299.0, + "step": 537 + }, + { + "epoch": 3.959409594095941, + "grad_norm": 0.6770695720932642, + "learning_rate": 7.170809858346413e-06, + "loss": 0.579, + "num_tokens": 59504160.0, + "step": 538 + }, + { + "epoch": 3.9667896678966788, + "grad_norm": 0.5845919920047277, + "learning_rate": 7.160777101062866e-06, + "loss": 0.496, + "num_tokens": 59610918.0, + "step": 539 + }, + { + "epoch": 3.974169741697417, + "grad_norm": 0.6342011369220734, + "learning_rate": 7.150734777752547e-06, + "loss": 0.6157, + "num_tokens": 59710685.0, + "step": 540 + }, + { + "epoch": 3.981549815498155, + "grad_norm": 0.7509567878858718, + "learning_rate": 7.140682946258942e-06, + "loss": 0.5196, + "num_tokens": 59800753.0, + "step": 541 + }, + { + "epoch": 3.988929889298893, + "grad_norm": 0.617408380954175, + "learning_rate": 7.130621664480301e-06, + "loss": 0.5888, + "num_tokens": 59929772.0, + "step": 542 + }, + { + "epoch": 3.9963099630996313, + "grad_norm": 0.616688560156464, + "learning_rate": 7.1205509903693084e-06, + "loss": 0.6072, + "num_tokens": 60027785.0, + "step": 543 + }, + { + "epoch": 4.0, + "grad_norm": 1.077119931873448, + "learning_rate": 7.1104709819327455e-06, + "loss": 0.4023, + "num_tokens": 60072523.0, + "step": 544 + }, + { + "epoch": 4.007380073800738, + "grad_norm": 0.8185549681071511, + "learning_rate": 7.1003816972311636e-06, + "loss": 0.3787, + "num_tokens": 60178213.0, + "step": 545 + }, + { + "epoch": 4.014760147601476, + "grad_norm": 0.6879287599827005, + "learning_rate": 7.090283194378544e-06, + "loss": 0.4793, + "num_tokens": 60343840.0, + "step": 546 + }, + { + "epoch": 4.022140221402214, + "grad_norm": 0.7246888092805986, + "learning_rate": 7.0801755315419595e-06, + "loss": 0.4101, + "num_tokens": 60439988.0, + "step": 547 + }, + { + "epoch": 4.029520295202952, + "grad_norm": 0.5455761792313781, + "learning_rate": 7.070058766941251e-06, + "loss": 0.3616, + "num_tokens": 60565853.0, + "step": 548 + }, + { + "epoch": 4.03690036900369, + "grad_norm": 0.7686756616172389, + "learning_rate": 7.05993295884868e-06, + "loss": 0.493, + "num_tokens": 60681326.0, + "step": 549 + }, + { + "epoch": 4.044280442804428, + "grad_norm": 1.042380896202571, + "learning_rate": 7.049798165588603e-06, + "loss": 0.4094, + "num_tokens": 60788082.0, + "step": 550 + }, + { + "epoch": 4.051660516605166, + "grad_norm": 1.0128125858080455, + "learning_rate": 7.039654445537126e-06, + "loss": 0.421, + "num_tokens": 60893178.0, + "step": 551 + }, + { + "epoch": 4.059040590405904, + "grad_norm": 0.8866696577371358, + "learning_rate": 7.029501857121776e-06, + "loss": 0.3882, + "num_tokens": 61007793.0, + "step": 552 + }, + { + "epoch": 4.0664206642066425, + "grad_norm": 0.7827477640476983, + "learning_rate": 7.01934045882116e-06, + "loss": 0.3865, + "num_tokens": 61148858.0, + "step": 553 + }, + { + "epoch": 4.07380073800738, + "grad_norm": 0.7403004236589705, + "learning_rate": 7.009170309164631e-06, + "loss": 0.4022, + "num_tokens": 61256572.0, + "step": 554 + }, + { + "epoch": 4.081180811808118, + "grad_norm": 0.773274412714197, + "learning_rate": 6.9989914667319495e-06, + "loss": 0.4321, + "num_tokens": 61348777.0, + "step": 555 + }, + { + "epoch": 4.088560885608856, + "grad_norm": 0.6936826874938206, + "learning_rate": 6.988803990152944e-06, + "loss": 0.3726, + "num_tokens": 61463473.0, + "step": 556 + }, + { + "epoch": 4.095940959409594, + "grad_norm": 0.6364735122936085, + "learning_rate": 6.978607938107177e-06, + "loss": 0.3764, + "num_tokens": 61608501.0, + "step": 557 + }, + { + "epoch": 4.1033210332103325, + "grad_norm": 0.6736554158825293, + "learning_rate": 6.968403369323607e-06, + "loss": 0.3921, + "num_tokens": 61751791.0, + "step": 558 + }, + { + "epoch": 4.11070110701107, + "grad_norm": 0.6192184760350499, + "learning_rate": 6.958190342580248e-06, + "loss": 0.3915, + "num_tokens": 61887652.0, + "step": 559 + }, + { + "epoch": 4.118081180811808, + "grad_norm": 0.6154621603693199, + "learning_rate": 6.9479689167038265e-06, + "loss": 0.3364, + "num_tokens": 62008678.0, + "step": 560 + }, + { + "epoch": 4.125461254612546, + "grad_norm": 0.7049771082005446, + "learning_rate": 6.937739150569455e-06, + "loss": 0.4111, + "num_tokens": 62158409.0, + "step": 561 + }, + { + "epoch": 4.132841328413284, + "grad_norm": 0.7181167850144017, + "learning_rate": 6.927501103100284e-06, + "loss": 0.3484, + "num_tokens": 62271321.0, + "step": 562 + }, + { + "epoch": 4.1402214022140225, + "grad_norm": 0.794654931511997, + "learning_rate": 6.91725483326716e-06, + "loss": 0.3911, + "num_tokens": 62356715.0, + "step": 563 + }, + { + "epoch": 4.14760147601476, + "grad_norm": 0.7907185130050475, + "learning_rate": 6.907000400088293e-06, + "loss": 0.3621, + "num_tokens": 62445912.0, + "step": 564 + }, + { + "epoch": 4.154981549815498, + "grad_norm": 0.7203231878606359, + "learning_rate": 6.896737862628914e-06, + "loss": 0.3761, + "num_tokens": 62552929.0, + "step": 565 + }, + { + "epoch": 4.162361623616236, + "grad_norm": 0.7850729901068995, + "learning_rate": 6.886467280000935e-06, + "loss": 0.4343, + "num_tokens": 62695274.0, + "step": 566 + }, + { + "epoch": 4.169741697416974, + "grad_norm": 0.7268275265837512, + "learning_rate": 6.876188711362604e-06, + "loss": 0.3882, + "num_tokens": 62820912.0, + "step": 567 + }, + { + "epoch": 4.177121771217712, + "grad_norm": 0.687720731041873, + "learning_rate": 6.865902215918175e-06, + "loss": 0.4819, + "num_tokens": 62936836.0, + "step": 568 + }, + { + "epoch": 4.18450184501845, + "grad_norm": 0.6633965124703752, + "learning_rate": 6.855607852917555e-06, + "loss": 0.3969, + "num_tokens": 63069777.0, + "step": 569 + }, + { + "epoch": 4.191881918819188, + "grad_norm": 0.7588942643609261, + "learning_rate": 6.845305681655967e-06, + "loss": 0.403, + "num_tokens": 63159681.0, + "step": 570 + }, + { + "epoch": 4.199261992619927, + "grad_norm": 0.6347494522478722, + "learning_rate": 6.834995761473614e-06, + "loss": 0.4054, + "num_tokens": 63279217.0, + "step": 571 + }, + { + "epoch": 4.206642066420664, + "grad_norm": 0.6614862018835697, + "learning_rate": 6.824678151755328e-06, + "loss": 0.4541, + "num_tokens": 63417206.0, + "step": 572 + }, + { + "epoch": 4.214022140221402, + "grad_norm": 0.6903189153624362, + "learning_rate": 6.814352911930236e-06, + "loss": 0.4138, + "num_tokens": 63537780.0, + "step": 573 + }, + { + "epoch": 4.22140221402214, + "grad_norm": 0.6693782968512514, + "learning_rate": 6.8040201014714115e-06, + "loss": 0.3367, + "num_tokens": 63638489.0, + "step": 574 + }, + { + "epoch": 4.228782287822878, + "grad_norm": 0.752241187045228, + "learning_rate": 6.793679779895538e-06, + "loss": 0.4096, + "num_tokens": 63741860.0, + "step": 575 + }, + { + "epoch": 4.236162361623617, + "grad_norm": 0.7282925415266427, + "learning_rate": 6.783332006762556e-06, + "loss": 0.9501, + "num_tokens": 63870304.0, + "step": 576 + }, + { + "epoch": 4.243542435424354, + "grad_norm": 0.8245230569269095, + "learning_rate": 6.772976841675337e-06, + "loss": 0.4047, + "num_tokens": 63958107.0, + "step": 577 + }, + { + "epoch": 4.250922509225092, + "grad_norm": 0.9844307444354672, + "learning_rate": 6.76261434427932e-06, + "loss": 0.4605, + "num_tokens": 64070823.0, + "step": 578 + }, + { + "epoch": 4.25830258302583, + "grad_norm": 0.7565155101273008, + "learning_rate": 6.752244574262186e-06, + "loss": 0.4045, + "num_tokens": 64164408.0, + "step": 579 + }, + { + "epoch": 4.265682656826568, + "grad_norm": 0.7759330833778824, + "learning_rate": 6.741867591353498e-06, + "loss": 0.4147, + "num_tokens": 64273354.0, + "step": 580 + }, + { + "epoch": 4.273062730627307, + "grad_norm": 0.6130687740196629, + "learning_rate": 6.731483455324374e-06, + "loss": 0.3457, + "num_tokens": 64391822.0, + "step": 581 + }, + { + "epoch": 4.280442804428044, + "grad_norm": 0.6339762466431851, + "learning_rate": 6.7210922259871245e-06, + "loss": 0.3936, + "num_tokens": 64514968.0, + "step": 582 + }, + { + "epoch": 4.287822878228782, + "grad_norm": 0.6391658581598267, + "learning_rate": 6.710693963194925e-06, + "loss": 0.3848, + "num_tokens": 64631212.0, + "step": 583 + }, + { + "epoch": 4.29520295202952, + "grad_norm": 0.7358101426442469, + "learning_rate": 6.7002887268414595e-06, + "loss": 0.3717, + "num_tokens": 64730478.0, + "step": 584 + }, + { + "epoch": 4.302583025830258, + "grad_norm": 0.8027379283621318, + "learning_rate": 6.68987657686058e-06, + "loss": 0.4161, + "num_tokens": 64830731.0, + "step": 585 + }, + { + "epoch": 4.3099630996309966, + "grad_norm": 0.6477689552553441, + "learning_rate": 6.679457573225961e-06, + "loss": 0.3685, + "num_tokens": 64942597.0, + "step": 586 + }, + { + "epoch": 4.317343173431734, + "grad_norm": 0.7229661768785778, + "learning_rate": 6.669031775950754e-06, + "loss": 0.3956, + "num_tokens": 65049820.0, + "step": 587 + }, + { + "epoch": 4.324723247232472, + "grad_norm": 0.7471828751844891, + "learning_rate": 6.658599245087242e-06, + "loss": 0.3206, + "num_tokens": 65133098.0, + "step": 588 + }, + { + "epoch": 4.332103321033211, + "grad_norm": 0.7548872209570942, + "learning_rate": 6.6481600407264926e-06, + "loss": 0.4157, + "num_tokens": 65240629.0, + "step": 589 + }, + { + "epoch": 4.339483394833948, + "grad_norm": 0.8223112816350289, + "learning_rate": 6.637714222998013e-06, + "loss": 0.4239, + "num_tokens": 65325662.0, + "step": 590 + }, + { + "epoch": 4.3468634686346865, + "grad_norm": 0.6679953036237622, + "learning_rate": 6.627261852069402e-06, + "loss": 0.3428, + "num_tokens": 65428326.0, + "step": 591 + }, + { + "epoch": 4.354243542435424, + "grad_norm": 0.7583322535402786, + "learning_rate": 6.616802988146008e-06, + "loss": 0.3832, + "num_tokens": 65538706.0, + "step": 592 + }, + { + "epoch": 4.361623616236162, + "grad_norm": 0.673827305891719, + "learning_rate": 6.606337691470575e-06, + "loss": 0.398, + "num_tokens": 65684900.0, + "step": 593 + }, + { + "epoch": 4.369003690036901, + "grad_norm": 0.6675827806343153, + "learning_rate": 6.595866022322901e-06, + "loss": 0.3812, + "num_tokens": 65781453.0, + "step": 594 + }, + { + "epoch": 4.376383763837638, + "grad_norm": 0.6942316723004445, + "learning_rate": 6.585388041019488e-06, + "loss": 0.4026, + "num_tokens": 65892895.0, + "step": 595 + }, + { + "epoch": 4.3837638376383765, + "grad_norm": 0.6709103394389789, + "learning_rate": 6.574903807913201e-06, + "loss": 0.3742, + "num_tokens": 66008234.0, + "step": 596 + }, + { + "epoch": 4.391143911439114, + "grad_norm": 0.6905786327427121, + "learning_rate": 6.5644133833929065e-06, + "loss": 0.3655, + "num_tokens": 66109214.0, + "step": 597 + }, + { + "epoch": 4.398523985239852, + "grad_norm": 0.7557820237247774, + "learning_rate": 6.553916827883142e-06, + "loss": 0.47, + "num_tokens": 66217645.0, + "step": 598 + }, + { + "epoch": 4.405904059040591, + "grad_norm": 0.7156665158820491, + "learning_rate": 6.543414201843753e-06, + "loss": 0.4587, + "num_tokens": 66351675.0, + "step": 599 + }, + { + "epoch": 4.413284132841328, + "grad_norm": 0.7314287717700599, + "learning_rate": 6.532905565769556e-06, + "loss": 0.3386, + "num_tokens": 66442708.0, + "step": 600 + }, + { + "epoch": 4.4206642066420665, + "grad_norm": 0.7399482793307474, + "learning_rate": 6.52239098018998e-06, + "loss": 0.4569, + "num_tokens": 66560923.0, + "step": 601 + }, + { + "epoch": 4.428044280442805, + "grad_norm": 0.634745317658342, + "learning_rate": 6.511870505668726e-06, + "loss": 0.3599, + "num_tokens": 66683186.0, + "step": 602 + }, + { + "epoch": 4.435424354243542, + "grad_norm": 0.7047160432707139, + "learning_rate": 6.501344202803415e-06, + "loss": 0.362, + "num_tokens": 66798415.0, + "step": 603 + }, + { + "epoch": 4.442804428044281, + "grad_norm": 0.6775990917406843, + "learning_rate": 6.490812132225241e-06, + "loss": 0.4059, + "num_tokens": 66934969.0, + "step": 604 + }, + { + "epoch": 4.450184501845018, + "grad_norm": 0.8094929699161325, + "learning_rate": 6.480274354598615e-06, + "loss": 0.4689, + "num_tokens": 67027774.0, + "step": 605 + }, + { + "epoch": 4.4575645756457565, + "grad_norm": 0.8226454567493683, + "learning_rate": 6.469730930620824e-06, + "loss": 0.3639, + "num_tokens": 67119023.0, + "step": 606 + }, + { + "epoch": 4.464944649446495, + "grad_norm": 0.6924009621937695, + "learning_rate": 6.459181921021676e-06, + "loss": 0.3743, + "num_tokens": 67217382.0, + "step": 607 + }, + { + "epoch": 4.472324723247232, + "grad_norm": 0.7349983289292954, + "learning_rate": 6.448627386563155e-06, + "loss": 0.4336, + "num_tokens": 67318097.0, + "step": 608 + }, + { + "epoch": 4.479704797047971, + "grad_norm": 0.6794249119652119, + "learning_rate": 6.438067388039065e-06, + "loss": 0.3629, + "num_tokens": 67434604.0, + "step": 609 + }, + { + "epoch": 4.487084870848708, + "grad_norm": 0.8646818271476965, + "learning_rate": 6.427501986274684e-06, + "loss": 0.4263, + "num_tokens": 67512963.0, + "step": 610 + }, + { + "epoch": 4.4944649446494465, + "grad_norm": 0.7568121700794729, + "learning_rate": 6.41693124212641e-06, + "loss": 0.3675, + "num_tokens": 67614456.0, + "step": 611 + }, + { + "epoch": 4.501845018450185, + "grad_norm": 0.7672043222057289, + "learning_rate": 6.40635521648142e-06, + "loss": 0.3904, + "num_tokens": 67699709.0, + "step": 612 + }, + { + "epoch": 4.509225092250922, + "grad_norm": 0.7511345874881337, + "learning_rate": 6.395773970257303e-06, + "loss": 0.3599, + "num_tokens": 67804127.0, + "step": 613 + }, + { + "epoch": 4.516605166051661, + "grad_norm": 0.6867131065685464, + "learning_rate": 6.385187564401727e-06, + "loss": 0.353, + "num_tokens": 67903764.0, + "step": 614 + }, + { + "epoch": 4.523985239852399, + "grad_norm": 0.6710396335014553, + "learning_rate": 6.374596059892073e-06, + "loss": 0.6348, + "num_tokens": 68034031.0, + "step": 615 + }, + { + "epoch": 4.531365313653136, + "grad_norm": 0.8625075936358634, + "learning_rate": 6.363999517735091e-06, + "loss": 0.722, + "num_tokens": 68148463.0, + "step": 616 + }, + { + "epoch": 4.538745387453875, + "grad_norm": 0.6800388785943237, + "learning_rate": 6.353397998966551e-06, + "loss": 0.4106, + "num_tokens": 68267729.0, + "step": 617 + }, + { + "epoch": 4.546125461254612, + "grad_norm": 0.7271259266704767, + "learning_rate": 6.342791564650886e-06, + "loss": 0.4337, + "num_tokens": 68382722.0, + "step": 618 + }, + { + "epoch": 4.553505535055351, + "grad_norm": 0.7502362722709528, + "learning_rate": 6.332180275880843e-06, + "loss": 0.4248, + "num_tokens": 68475804.0, + "step": 619 + }, + { + "epoch": 4.560885608856088, + "grad_norm": 0.7333546816412613, + "learning_rate": 6.321564193777129e-06, + "loss": 0.3865, + "num_tokens": 68570574.0, + "step": 620 + }, + { + "epoch": 4.568265682656826, + "grad_norm": 0.8558187774009335, + "learning_rate": 6.310943379488061e-06, + "loss": 0.4161, + "num_tokens": 68668870.0, + "step": 621 + }, + { + "epoch": 4.575645756457565, + "grad_norm": 0.7552469581133401, + "learning_rate": 6.3003178941892165e-06, + "loss": 0.3005, + "num_tokens": 68760744.0, + "step": 622 + }, + { + "epoch": 4.583025830258302, + "grad_norm": 0.5928004896884772, + "learning_rate": 6.289687799083073e-06, + "loss": 0.3343, + "num_tokens": 68879055.0, + "step": 623 + }, + { + "epoch": 4.590405904059041, + "grad_norm": 0.7523382356997574, + "learning_rate": 6.279053155398663e-06, + "loss": 0.4567, + "num_tokens": 68972393.0, + "step": 624 + }, + { + "epoch": 4.597785977859779, + "grad_norm": 0.7371578316979396, + "learning_rate": 6.268414024391218e-06, + "loss": 0.4588, + "num_tokens": 69073371.0, + "step": 625 + }, + { + "epoch": 4.605166051660516, + "grad_norm": 0.8698095075929976, + "learning_rate": 6.2577704673418195e-06, + "loss": 0.4769, + "num_tokens": 69147682.0, + "step": 626 + }, + { + "epoch": 4.612546125461255, + "grad_norm": 0.6516647547945007, + "learning_rate": 6.247122545557036e-06, + "loss": 0.385, + "num_tokens": 69271135.0, + "step": 627 + }, + { + "epoch": 4.619926199261993, + "grad_norm": 0.7705970000583697, + "learning_rate": 6.236470320368582e-06, + "loss": 0.3982, + "num_tokens": 69379315.0, + "step": 628 + }, + { + "epoch": 4.627306273062731, + "grad_norm": 0.8054329722423857, + "learning_rate": 6.2258138531329595e-06, + "loss": 0.4099, + "num_tokens": 69471071.0, + "step": 629 + }, + { + "epoch": 4.634686346863469, + "grad_norm": 0.8582480575625172, + "learning_rate": 6.2151532052311e-06, + "loss": 0.4169, + "num_tokens": 69546670.0, + "step": 630 + }, + { + "epoch": 4.642066420664206, + "grad_norm": 0.8348601482706759, + "learning_rate": 6.204488438068021e-06, + "loss": 0.448, + "num_tokens": 69642136.0, + "step": 631 + }, + { + "epoch": 4.649446494464945, + "grad_norm": 0.6430948321517582, + "learning_rate": 6.193819613072467e-06, + "loss": 0.4438, + "num_tokens": 69832108.0, + "step": 632 + }, + { + "epoch": 4.656826568265682, + "grad_norm": 0.6425387764210215, + "learning_rate": 6.183146791696549e-06, + "loss": 0.3639, + "num_tokens": 69942356.0, + "step": 633 + }, + { + "epoch": 4.6642066420664205, + "grad_norm": 0.6934609339884044, + "learning_rate": 6.172470035415403e-06, + "loss": 0.3851, + "num_tokens": 70061692.0, + "step": 634 + }, + { + "epoch": 4.671586715867159, + "grad_norm": 0.716829048974258, + "learning_rate": 6.1617894057268276e-06, + "loss": 0.3516, + "num_tokens": 70168745.0, + "step": 635 + }, + { + "epoch": 4.678966789667896, + "grad_norm": 0.7743408601424967, + "learning_rate": 6.151104964150932e-06, + "loss": 0.4137, + "num_tokens": 70262643.0, + "step": 636 + }, + { + "epoch": 4.686346863468635, + "grad_norm": 0.6771485702327544, + "learning_rate": 6.140416772229785e-06, + "loss": 0.4256, + "num_tokens": 70383350.0, + "step": 637 + }, + { + "epoch": 4.693726937269373, + "grad_norm": 0.6886339783733789, + "learning_rate": 6.129724891527049e-06, + "loss": 0.4286, + "num_tokens": 70531946.0, + "step": 638 + }, + { + "epoch": 4.7011070110701105, + "grad_norm": 0.7102941718952961, + "learning_rate": 6.119029383627645e-06, + "loss": 0.3719, + "num_tokens": 70643122.0, + "step": 639 + }, + { + "epoch": 4.708487084870849, + "grad_norm": 0.6907839812903495, + "learning_rate": 6.108330310137379e-06, + "loss": 0.3986, + "num_tokens": 70741793.0, + "step": 640 + }, + { + "epoch": 4.715867158671586, + "grad_norm": 0.6689514709397577, + "learning_rate": 6.097627732682596e-06, + "loss": 0.4467, + "num_tokens": 70864577.0, + "step": 641 + }, + { + "epoch": 4.723247232472325, + "grad_norm": 0.6774349195540061, + "learning_rate": 6.086921712909824e-06, + "loss": 0.3608, + "num_tokens": 70983590.0, + "step": 642 + }, + { + "epoch": 4.730627306273063, + "grad_norm": 0.644666936201413, + "learning_rate": 6.076212312485419e-06, + "loss": 0.4958, + "num_tokens": 71099310.0, + "step": 643 + }, + { + "epoch": 4.7380073800738005, + "grad_norm": 0.6965330619998724, + "learning_rate": 6.0654995930952085e-06, + "loss": 0.4535, + "num_tokens": 71227732.0, + "step": 644 + }, + { + "epoch": 4.745387453874539, + "grad_norm": 0.7391158181982388, + "learning_rate": 6.054783616444141e-06, + "loss": 0.4149, + "num_tokens": 71344542.0, + "step": 645 + }, + { + "epoch": 4.752767527675276, + "grad_norm": 0.7092144938171325, + "learning_rate": 6.044064444255921e-06, + "loss": 0.3611, + "num_tokens": 71450018.0, + "step": 646 + }, + { + "epoch": 4.760147601476015, + "grad_norm": 0.7822866247408168, + "learning_rate": 6.033342138272663e-06, + "loss": 0.3855, + "num_tokens": 71532016.0, + "step": 647 + }, + { + "epoch": 4.767527675276753, + "grad_norm": 0.6949716125906905, + "learning_rate": 6.0226167602545296e-06, + "loss": 0.4147, + "num_tokens": 71644441.0, + "step": 648 + }, + { + "epoch": 4.7749077490774905, + "grad_norm": 0.7030059314502649, + "learning_rate": 6.01188837197938e-06, + "loss": 0.3824, + "num_tokens": 71745511.0, + "step": 649 + }, + { + "epoch": 4.782287822878229, + "grad_norm": 0.6864412003773923, + "learning_rate": 6.001157035242415e-06, + "loss": 0.3907, + "num_tokens": 71858712.0, + "step": 650 + }, + { + "epoch": 4.789667896678967, + "grad_norm": 0.7049625591277209, + "learning_rate": 5.9904228118558126e-06, + "loss": 0.3608, + "num_tokens": 71947709.0, + "step": 651 + }, + { + "epoch": 4.797047970479705, + "grad_norm": 0.8213761554544584, + "learning_rate": 5.979685763648381e-06, + "loss": 0.4346, + "num_tokens": 72031618.0, + "step": 652 + }, + { + "epoch": 4.804428044280443, + "grad_norm": 0.6808314414773919, + "learning_rate": 5.968945952465199e-06, + "loss": 0.3338, + "num_tokens": 72166482.0, + "step": 653 + }, + { + "epoch": 4.8118081180811805, + "grad_norm": 0.6746618582832787, + "learning_rate": 5.958203440167261e-06, + "loss": 0.4232, + "num_tokens": 72291372.0, + "step": 654 + }, + { + "epoch": 4.819188191881919, + "grad_norm": 0.8480120297387668, + "learning_rate": 5.947458288631117e-06, + "loss": 0.4583, + "num_tokens": 72399830.0, + "step": 655 + }, + { + "epoch": 4.826568265682657, + "grad_norm": 0.7251109084368389, + "learning_rate": 5.936710559748521e-06, + "loss": 0.4565, + "num_tokens": 72503354.0, + "step": 656 + }, + { + "epoch": 4.833948339483395, + "grad_norm": 0.7006893405079634, + "learning_rate": 5.925960315426072e-06, + "loss": 0.4161, + "num_tokens": 72624460.0, + "step": 657 + }, + { + "epoch": 4.841328413284133, + "grad_norm": 0.6880629191287833, + "learning_rate": 5.915207617584859e-06, + "loss": 0.3444, + "num_tokens": 72741862.0, + "step": 658 + }, + { + "epoch": 4.8487084870848705, + "grad_norm": 0.6383251279558573, + "learning_rate": 5.904452528160104e-06, + "loss": 0.4148, + "num_tokens": 72883296.0, + "step": 659 + }, + { + "epoch": 4.856088560885609, + "grad_norm": 0.7516182787280705, + "learning_rate": 5.893695109100798e-06, + "loss": 0.3921, + "num_tokens": 72994318.0, + "step": 660 + }, + { + "epoch": 4.863468634686347, + "grad_norm": 0.7662024035456283, + "learning_rate": 5.882935422369359e-06, + "loss": 0.4267, + "num_tokens": 73087117.0, + "step": 661 + }, + { + "epoch": 4.870848708487085, + "grad_norm": 0.8083001695801738, + "learning_rate": 5.872173529941261e-06, + "loss": 0.4142, + "num_tokens": 73192480.0, + "step": 662 + }, + { + "epoch": 4.878228782287823, + "grad_norm": 0.699125650675328, + "learning_rate": 5.861409493804686e-06, + "loss": 0.4136, + "num_tokens": 73297176.0, + "step": 663 + }, + { + "epoch": 4.885608856088561, + "grad_norm": 0.6535158624821691, + "learning_rate": 5.850643375960161e-06, + "loss": 0.387, + "num_tokens": 73430941.0, + "step": 664 + }, + { + "epoch": 4.892988929889299, + "grad_norm": 0.6861981476949671, + "learning_rate": 5.839875238420206e-06, + "loss": 0.4156, + "num_tokens": 73555649.0, + "step": 665 + }, + { + "epoch": 4.900369003690037, + "grad_norm": 0.7059838206641451, + "learning_rate": 5.829105143208973e-06, + "loss": 0.3595, + "num_tokens": 73649936.0, + "step": 666 + }, + { + "epoch": 4.907749077490775, + "grad_norm": 0.7876415348592959, + "learning_rate": 5.818333152361891e-06, + "loss": 0.4678, + "num_tokens": 73769910.0, + "step": 667 + }, + { + "epoch": 4.915129151291513, + "grad_norm": 0.7843312715366683, + "learning_rate": 5.807559327925307e-06, + "loss": 0.3994, + "num_tokens": 73870043.0, + "step": 668 + }, + { + "epoch": 4.922509225092251, + "grad_norm": 0.8266922375686809, + "learning_rate": 5.79678373195613e-06, + "loss": 0.4432, + "num_tokens": 73968723.0, + "step": 669 + }, + { + "epoch": 4.929889298892989, + "grad_norm": 0.7553323604194044, + "learning_rate": 5.786006426521473e-06, + "loss": 0.3661, + "num_tokens": 74074339.0, + "step": 670 + }, + { + "epoch": 4.937269372693727, + "grad_norm": 0.8281643428151172, + "learning_rate": 5.775227473698294e-06, + "loss": 0.4633, + "num_tokens": 74184933.0, + "step": 671 + }, + { + "epoch": 4.944649446494465, + "grad_norm": 0.8028164536285961, + "learning_rate": 5.7644469355730414e-06, + "loss": 0.4214, + "num_tokens": 74274840.0, + "step": 672 + }, + { + "epoch": 4.952029520295203, + "grad_norm": 0.8092839757824163, + "learning_rate": 5.753664874241295e-06, + "loss": 0.4089, + "num_tokens": 74361517.0, + "step": 673 + }, + { + "epoch": 4.959409594095941, + "grad_norm": 0.774685434447823, + "learning_rate": 5.7428813518074065e-06, + "loss": 0.3799, + "num_tokens": 74446520.0, + "step": 674 + }, + { + "epoch": 4.966789667896679, + "grad_norm": 0.643089944875341, + "learning_rate": 5.732096430384148e-06, + "loss": 0.8404, + "num_tokens": 74627275.0, + "step": 675 + }, + { + "epoch": 4.974169741697417, + "grad_norm": 0.7416708152229656, + "learning_rate": 5.7213101720923425e-06, + "loss": 0.3653, + "num_tokens": 74740803.0, + "step": 676 + }, + { + "epoch": 4.9815498154981555, + "grad_norm": 0.7906004267986075, + "learning_rate": 5.710522639060521e-06, + "loss": 0.4038, + "num_tokens": 74831156.0, + "step": 677 + }, + { + "epoch": 4.988929889298893, + "grad_norm": 0.8357493566883484, + "learning_rate": 5.6997338934245505e-06, + "loss": 0.4747, + "num_tokens": 74945585.0, + "step": 678 + }, + { + "epoch": 4.996309963099631, + "grad_norm": 0.738553121534108, + "learning_rate": 5.6889439973272886e-06, + "loss": 0.371, + "num_tokens": 75043311.0, + "step": 679 + }, + { + "epoch": 5.0, + "grad_norm": 0.738553121534108, + "learning_rate": 5.678153012918214e-06, + "loss": 0.3903, + "num_tokens": 75087882.0, + "step": 680 + }, + { + "epoch": 5.007380073800738, + "grad_norm": 1.231554549265975, + "learning_rate": 5.667361002353077e-06, + "loss": 0.3047, + "num_tokens": 75171142.0, + "step": 681 + }, + { + "epoch": 5.014760147601476, + "grad_norm": 0.8071987884892139, + "learning_rate": 5.6565680277935355e-06, + "loss": 0.3119, + "num_tokens": 75284705.0, + "step": 682 + }, + { + "epoch": 5.022140221402214, + "grad_norm": 0.7519543262012047, + "learning_rate": 5.6457741514068055e-06, + "loss": 0.3127, + "num_tokens": 75398261.0, + "step": 683 + }, + { + "epoch": 5.029520295202952, + "grad_norm": 0.7647598207944065, + "learning_rate": 5.6349794353652934e-06, + "loss": 0.3114, + "num_tokens": 75476769.0, + "step": 684 + }, + { + "epoch": 5.03690036900369, + "grad_norm": 0.7928624379185287, + "learning_rate": 5.624183941846243e-06, + "loss": 0.3665, + "num_tokens": 75608269.0, + "step": 685 + }, + { + "epoch": 5.044280442804428, + "grad_norm": 0.838843344533878, + "learning_rate": 5.6133877330313756e-06, + "loss": 0.2534, + "num_tokens": 75724961.0, + "step": 686 + }, + { + "epoch": 5.051660516605166, + "grad_norm": 1.0923805020857955, + "learning_rate": 5.6025908711065355e-06, + "loss": 0.2563, + "num_tokens": 75808372.0, + "step": 687 + }, + { + "epoch": 5.059040590405904, + "grad_norm": 0.8211586325579034, + "learning_rate": 5.591793418261326e-06, + "loss": 0.2444, + "num_tokens": 75901148.0, + "step": 688 + }, + { + "epoch": 5.0664206642066425, + "grad_norm": 0.9794952627885642, + "learning_rate": 5.580995436688752e-06, + "loss": 0.2869, + "num_tokens": 76023798.0, + "step": 689 + }, + { + "epoch": 5.07380073800738, + "grad_norm": 0.7968572551605058, + "learning_rate": 5.570196988584867e-06, + "loss": 0.246, + "num_tokens": 76127369.0, + "step": 690 + }, + { + "epoch": 5.081180811808118, + "grad_norm": 0.8498337469106462, + "learning_rate": 5.559398136148416e-06, + "loss": 0.3446, + "num_tokens": 76212514.0, + "step": 691 + }, + { + "epoch": 5.088560885608856, + "grad_norm": 0.8863003457661772, + "learning_rate": 5.548598941580464e-06, + "loss": 0.3075, + "num_tokens": 76315534.0, + "step": 692 + }, + { + "epoch": 5.095940959409594, + "grad_norm": 0.6821618701141282, + "learning_rate": 5.537799467084051e-06, + "loss": 0.7477, + "num_tokens": 76432951.0, + "step": 693 + }, + { + "epoch": 5.1033210332103325, + "grad_norm": 0.7758856845828853, + "learning_rate": 5.526999774863831e-06, + "loss": 0.3442, + "num_tokens": 76575911.0, + "step": 694 + }, + { + "epoch": 5.11070110701107, + "grad_norm": 0.864538306122709, + "learning_rate": 5.516199927125711e-06, + "loss": 0.326, + "num_tokens": 76685085.0, + "step": 695 + }, + { + "epoch": 5.118081180811808, + "grad_norm": 0.6841879390249027, + "learning_rate": 5.505399986076491e-06, + "loss": 0.3219, + "num_tokens": 76821158.0, + "step": 696 + }, + { + "epoch": 5.125461254612546, + "grad_norm": 0.763740855553643, + "learning_rate": 5.49460001392351e-06, + "loss": 0.341, + "num_tokens": 76930269.0, + "step": 697 + }, + { + "epoch": 5.132841328413284, + "grad_norm": 0.7570755487804708, + "learning_rate": 5.48380007287429e-06, + "loss": 0.2881, + "num_tokens": 77056550.0, + "step": 698 + }, + { + "epoch": 5.1402214022140225, + "grad_norm": 0.8240156667031876, + "learning_rate": 5.47300022513617e-06, + "loss": 0.3072, + "num_tokens": 77145852.0, + "step": 699 + }, + { + "epoch": 5.14760147601476, + "grad_norm": 0.784008083171791, + "learning_rate": 5.462200532915951e-06, + "loss": 0.3501, + "num_tokens": 77308212.0, + "step": 700 + }, + { + "epoch": 5.154981549815498, + "grad_norm": 0.7108287634133911, + "learning_rate": 5.451401058419537e-06, + "loss": 0.2905, + "num_tokens": 77423799.0, + "step": 701 + }, + { + "epoch": 5.162361623616236, + "grad_norm": 0.7883814815017057, + "learning_rate": 5.4406018638515855e-06, + "loss": 0.3394, + "num_tokens": 77546225.0, + "step": 702 + }, + { + "epoch": 5.169741697416974, + "grad_norm": 0.8230808734474867, + "learning_rate": 5.4298030114151335e-06, + "loss": 0.2824, + "num_tokens": 77660108.0, + "step": 703 + }, + { + "epoch": 5.177121771217712, + "grad_norm": 0.7732749264696992, + "learning_rate": 5.4190045633112506e-06, + "loss": 0.2962, + "num_tokens": 77779396.0, + "step": 704 + }, + { + "epoch": 5.18450184501845, + "grad_norm": 0.7020821068054012, + "learning_rate": 5.408206581738677e-06, + "loss": 0.33, + "num_tokens": 77891837.0, + "step": 705 + }, + { + "epoch": 5.191881918819188, + "grad_norm": 0.7757566830553919, + "learning_rate": 5.397409128893465e-06, + "loss": 0.2819, + "num_tokens": 77994807.0, + "step": 706 + }, + { + "epoch": 5.199261992619927, + "grad_norm": 0.7471023211265383, + "learning_rate": 5.386612266968625e-06, + "loss": 0.2987, + "num_tokens": 78101325.0, + "step": 707 + }, + { + "epoch": 5.206642066420664, + "grad_norm": 0.7249487534138149, + "learning_rate": 5.375816058153759e-06, + "loss": 0.3243, + "num_tokens": 78222337.0, + "step": 708 + }, + { + "epoch": 5.214022140221402, + "grad_norm": 0.6787659618795301, + "learning_rate": 5.365020564634709e-06, + "loss": 0.3288, + "num_tokens": 78353270.0, + "step": 709 + }, + { + "epoch": 5.22140221402214, + "grad_norm": 0.8629790416290948, + "learning_rate": 5.354225848593197e-06, + "loss": 0.2714, + "num_tokens": 78494800.0, + "step": 710 + }, + { + "epoch": 5.228782287822878, + "grad_norm": 0.6892763943645038, + "learning_rate": 5.343431972206467e-06, + "loss": 0.3069, + "num_tokens": 78649799.0, + "step": 711 + }, + { + "epoch": 5.236162361623617, + "grad_norm": 0.6629490118039914, + "learning_rate": 5.332638997646928e-06, + "loss": 0.3256, + "num_tokens": 78753962.0, + "step": 712 + }, + { + "epoch": 5.243542435424354, + "grad_norm": 0.8394640967851109, + "learning_rate": 5.321846987081789e-06, + "loss": 0.3206, + "num_tokens": 78867337.0, + "step": 713 + }, + { + "epoch": 5.250922509225092, + "grad_norm": 0.8567789895287279, + "learning_rate": 5.311056002672712e-06, + "loss": 0.289, + "num_tokens": 78941648.0, + "step": 714 + }, + { + "epoch": 5.25830258302583, + "grad_norm": 0.828870401236965, + "learning_rate": 5.300266106575449e-06, + "loss": 0.258, + "num_tokens": 79044888.0, + "step": 715 + }, + { + "epoch": 5.265682656826568, + "grad_norm": 0.7217389292988418, + "learning_rate": 5.28947736093948e-06, + "loss": 0.2869, + "num_tokens": 79161450.0, + "step": 716 + }, + { + "epoch": 5.273062730627307, + "grad_norm": 0.8522635118245224, + "learning_rate": 5.278689827907658e-06, + "loss": 0.2746, + "num_tokens": 79245810.0, + "step": 717 + }, + { + "epoch": 5.280442804428044, + "grad_norm": 0.7727771460862353, + "learning_rate": 5.2679035696158545e-06, + "loss": 0.2674, + "num_tokens": 79363746.0, + "step": 718 + }, + { + "epoch": 5.287822878228782, + "grad_norm": 0.8199060030313494, + "learning_rate": 5.257118648192595e-06, + "loss": 0.2579, + "num_tokens": 79473017.0, + "step": 719 + }, + { + "epoch": 5.29520295202952, + "grad_norm": 0.6251600014266541, + "learning_rate": 5.246335125758708e-06, + "loss": 0.2522, + "num_tokens": 79579974.0, + "step": 720 + }, + { + "epoch": 5.302583025830258, + "grad_norm": 0.7856376981624948, + "learning_rate": 5.235553064426962e-06, + "loss": 0.3232, + "num_tokens": 79703574.0, + "step": 721 + }, + { + "epoch": 5.3099630996309966, + "grad_norm": 0.9035725837038168, + "learning_rate": 5.224772526301709e-06, + "loss": 0.2593, + "num_tokens": 79783101.0, + "step": 722 + }, + { + "epoch": 5.317343173431734, + "grad_norm": 0.7035504175759855, + "learning_rate": 5.2139935734785286e-06, + "loss": 0.2619, + "num_tokens": 79919150.0, + "step": 723 + }, + { + "epoch": 5.324723247232472, + "grad_norm": 0.6941951970579242, + "learning_rate": 5.203216268043871e-06, + "loss": 0.2558, + "num_tokens": 80013775.0, + "step": 724 + }, + { + "epoch": 5.332103321033211, + "grad_norm": 0.7962220136583572, + "learning_rate": 5.1924406720746945e-06, + "loss": 0.2883, + "num_tokens": 80096300.0, + "step": 725 + }, + { + "epoch": 5.339483394833948, + "grad_norm": 0.7074533354205865, + "learning_rate": 5.18166684763811e-06, + "loss": 0.2714, + "num_tokens": 80222399.0, + "step": 726 + }, + { + "epoch": 5.3468634686346865, + "grad_norm": 0.8425504298670053, + "learning_rate": 5.170894856791029e-06, + "loss": 0.2963, + "num_tokens": 80303214.0, + "step": 727 + }, + { + "epoch": 5.354243542435424, + "grad_norm": 0.8349389571836422, + "learning_rate": 5.160124761579795e-06, + "loss": 0.2792, + "num_tokens": 80403735.0, + "step": 728 + }, + { + "epoch": 5.361623616236162, + "grad_norm": 0.8123470866862423, + "learning_rate": 5.149356624039841e-06, + "loss": 0.3166, + "num_tokens": 80502906.0, + "step": 729 + }, + { + "epoch": 5.369003690036901, + "grad_norm": 0.767130869184814, + "learning_rate": 5.138590506195317e-06, + "loss": 0.2721, + "num_tokens": 80599662.0, + "step": 730 + }, + { + "epoch": 5.376383763837638, + "grad_norm": 0.8751480885508618, + "learning_rate": 5.1278264700587425e-06, + "loss": 0.2788, + "num_tokens": 80696350.0, + "step": 731 + }, + { + "epoch": 5.3837638376383765, + "grad_norm": 0.6445966908177907, + "learning_rate": 5.1170645776306425e-06, + "loss": 0.3166, + "num_tokens": 80812184.0, + "step": 732 + }, + { + "epoch": 5.391143911439114, + "grad_norm": 0.7204141796556899, + "learning_rate": 5.106304890899203e-06, + "loss": 0.2975, + "num_tokens": 80936451.0, + "step": 733 + }, + { + "epoch": 5.398523985239852, + "grad_norm": 0.8073567323623798, + "learning_rate": 5.095547471839899e-06, + "loss": 0.2563, + "num_tokens": 81034128.0, + "step": 734 + }, + { + "epoch": 5.405904059040591, + "grad_norm": 0.6714427753918635, + "learning_rate": 5.084792382415142e-06, + "loss": 0.2291, + "num_tokens": 81138682.0, + "step": 735 + }, + { + "epoch": 5.413284132841328, + "grad_norm": 0.6240699238325595, + "learning_rate": 5.0740396845739305e-06, + "loss": 0.2665, + "num_tokens": 81260963.0, + "step": 736 + }, + { + "epoch": 5.4206642066420665, + "grad_norm": 0.7867412356008463, + "learning_rate": 5.063289440251481e-06, + "loss": 0.2566, + "num_tokens": 81375587.0, + "step": 737 + }, + { + "epoch": 5.428044280442805, + "grad_norm": 0.7584654734941334, + "learning_rate": 5.052541711368886e-06, + "loss": 0.2643, + "num_tokens": 81484438.0, + "step": 738 + }, + { + "epoch": 5.435424354243542, + "grad_norm": 0.7205599986317057, + "learning_rate": 5.041796559832742e-06, + "loss": 0.2732, + "num_tokens": 81596696.0, + "step": 739 + }, + { + "epoch": 5.442804428044281, + "grad_norm": 0.7574253228099226, + "learning_rate": 5.031054047534801e-06, + "loss": 0.3404, + "num_tokens": 81692711.0, + "step": 740 + }, + { + "epoch": 5.450184501845018, + "grad_norm": 0.7947677794363449, + "learning_rate": 5.02031423635162e-06, + "loss": 0.3239, + "num_tokens": 81792454.0, + "step": 741 + }, + { + "epoch": 5.4575645756457565, + "grad_norm": 0.7271884820294199, + "learning_rate": 5.009577188144188e-06, + "loss": 0.291, + "num_tokens": 81915318.0, + "step": 742 + }, + { + "epoch": 5.464944649446495, + "grad_norm": 0.760473112873417, + "learning_rate": 4.998842964757585e-06, + "loss": 0.3263, + "num_tokens": 82033948.0, + "step": 743 + }, + { + "epoch": 5.472324723247232, + "grad_norm": 0.7893622977330518, + "learning_rate": 4.98811162802062e-06, + "loss": 0.3152, + "num_tokens": 82179793.0, + "step": 744 + }, + { + "epoch": 5.479704797047971, + "grad_norm": 0.7019465607170431, + "learning_rate": 4.977383239745473e-06, + "loss": 0.3124, + "num_tokens": 82289619.0, + "step": 745 + }, + { + "epoch": 5.487084870848708, + "grad_norm": 0.8250003474119553, + "learning_rate": 4.9666578617273385e-06, + "loss": 0.3216, + "num_tokens": 82406037.0, + "step": 746 + }, + { + "epoch": 5.4944649446494465, + "grad_norm": 0.6795317303820285, + "learning_rate": 4.95593555574408e-06, + "loss": 0.2955, + "num_tokens": 82520422.0, + "step": 747 + }, + { + "epoch": 5.501845018450185, + "grad_norm": 0.8364977112949799, + "learning_rate": 4.945216383555861e-06, + "loss": 0.3216, + "num_tokens": 82610471.0, + "step": 748 + }, + { + "epoch": 5.509225092250922, + "grad_norm": 0.6878825020678628, + "learning_rate": 4.934500406904791e-06, + "loss": 0.2703, + "num_tokens": 82728500.0, + "step": 749 + }, + { + "epoch": 5.516605166051661, + "grad_norm": 0.7666698643359626, + "learning_rate": 4.923787687514583e-06, + "loss": 0.2858, + "num_tokens": 82836219.0, + "step": 750 + }, + { + "epoch": 5.523985239852399, + "grad_norm": 0.8127921752310425, + "learning_rate": 4.913078287090179e-06, + "loss": 0.2871, + "num_tokens": 82940103.0, + "step": 751 + }, + { + "epoch": 5.531365313653136, + "grad_norm": 0.8054768120172173, + "learning_rate": 4.902372267317405e-06, + "loss": 0.3016, + "num_tokens": 83025222.0, + "step": 752 + }, + { + "epoch": 5.538745387453875, + "grad_norm": 0.836243912493921, + "learning_rate": 4.891669689862622e-06, + "loss": 0.26, + "num_tokens": 83156808.0, + "step": 753 + }, + { + "epoch": 5.546125461254612, + "grad_norm": 0.7176219626456037, + "learning_rate": 4.880970616372357e-06, + "loss": 0.2925, + "num_tokens": 83240173.0, + "step": 754 + }, + { + "epoch": 5.553505535055351, + "grad_norm": 0.7513808908409143, + "learning_rate": 4.8702751084729515e-06, + "loss": 0.3037, + "num_tokens": 83398963.0, + "step": 755 + }, + { + "epoch": 5.560885608856088, + "grad_norm": 0.7860052138199383, + "learning_rate": 4.859583227770218e-06, + "loss": 0.2865, + "num_tokens": 83501594.0, + "step": 756 + }, + { + "epoch": 5.568265682656826, + "grad_norm": 0.7064769415440794, + "learning_rate": 4.848895035849069e-06, + "loss": 0.7912, + "num_tokens": 83620892.0, + "step": 757 + }, + { + "epoch": 5.575645756457565, + "grad_norm": 0.7332008728932892, + "learning_rate": 4.838210594273173e-06, + "loss": 0.2481, + "num_tokens": 83724229.0, + "step": 758 + }, + { + "epoch": 5.583025830258302, + "grad_norm": 0.7789765211913474, + "learning_rate": 4.827529964584597e-06, + "loss": 0.2908, + "num_tokens": 83815219.0, + "step": 759 + }, + { + "epoch": 5.590405904059041, + "grad_norm": 0.8722227692511327, + "learning_rate": 4.816853208303451e-06, + "loss": 0.3432, + "num_tokens": 83955057.0, + "step": 760 + }, + { + "epoch": 5.597785977859779, + "grad_norm": 0.6936633739164829, + "learning_rate": 4.8061803869275346e-06, + "loss": 0.3135, + "num_tokens": 84063496.0, + "step": 761 + }, + { + "epoch": 5.605166051660516, + "grad_norm": 0.7446073915999715, + "learning_rate": 4.795511561931979e-06, + "loss": 0.3309, + "num_tokens": 84194864.0, + "step": 762 + }, + { + "epoch": 5.612546125461255, + "grad_norm": 0.7682353671073113, + "learning_rate": 4.784846794768901e-06, + "loss": 0.3139, + "num_tokens": 84347886.0, + "step": 763 + }, + { + "epoch": 5.619926199261993, + "grad_norm": 0.8300487958678651, + "learning_rate": 4.7741861468670436e-06, + "loss": 0.3285, + "num_tokens": 84486118.0, + "step": 764 + }, + { + "epoch": 5.627306273062731, + "grad_norm": 0.7269550459131697, + "learning_rate": 4.76352967963142e-06, + "loss": 0.3205, + "num_tokens": 84602385.0, + "step": 765 + }, + { + "epoch": 5.634686346863469, + "grad_norm": 0.8167497040920041, + "learning_rate": 4.752877454442965e-06, + "loss": 0.2868, + "num_tokens": 84735813.0, + "step": 766 + }, + { + "epoch": 5.642066420664206, + "grad_norm": 0.8206917156476586, + "learning_rate": 4.742229532658181e-06, + "loss": 0.2631, + "num_tokens": 84820136.0, + "step": 767 + }, + { + "epoch": 5.649446494464945, + "grad_norm": 0.7621364635308342, + "learning_rate": 4.731585975608781e-06, + "loss": 0.311, + "num_tokens": 84962252.0, + "step": 768 + }, + { + "epoch": 5.656826568265682, + "grad_norm": 0.838977616803826, + "learning_rate": 4.7209468446013376e-06, + "loss": 0.2549, + "num_tokens": 85036308.0, + "step": 769 + }, + { + "epoch": 5.6642066420664205, + "grad_norm": 0.6747749021099678, + "learning_rate": 4.710312200916929e-06, + "loss": 0.3036, + "num_tokens": 85154854.0, + "step": 770 + }, + { + "epoch": 5.671586715867159, + "grad_norm": 0.7989285937342706, + "learning_rate": 4.699682105810786e-06, + "loss": 0.2443, + "num_tokens": 85280088.0, + "step": 771 + }, + { + "epoch": 5.678966789667896, + "grad_norm": 0.7371279613804671, + "learning_rate": 4.68905662051194e-06, + "loss": 0.2769, + "num_tokens": 85357886.0, + "step": 772 + }, + { + "epoch": 5.686346863468635, + "grad_norm": 0.7841880701337192, + "learning_rate": 4.678435806222873e-06, + "loss": 0.2655, + "num_tokens": 85472569.0, + "step": 773 + }, + { + "epoch": 5.693726937269373, + "grad_norm": 0.670554410834585, + "learning_rate": 4.667819724119159e-06, + "loss": 0.26, + "num_tokens": 85591197.0, + "step": 774 + }, + { + "epoch": 5.7011070110701105, + "grad_norm": 0.7609022381843286, + "learning_rate": 4.657208435349114e-06, + "loss": 0.2163, + "num_tokens": 85685049.0, + "step": 775 + }, + { + "epoch": 5.708487084870849, + "grad_norm": 0.7361212837503709, + "learning_rate": 4.64660200103345e-06, + "loss": 0.2669, + "num_tokens": 85791889.0, + "step": 776 + }, + { + "epoch": 5.715867158671586, + "grad_norm": 0.8126259109165639, + "learning_rate": 4.63600048226491e-06, + "loss": 0.3047, + "num_tokens": 85873668.0, + "step": 777 + }, + { + "epoch": 5.723247232472325, + "grad_norm": 0.7536847458523733, + "learning_rate": 4.625403940107929e-06, + "loss": 0.6545, + "num_tokens": 85977141.0, + "step": 778 + }, + { + "epoch": 5.730627306273063, + "grad_norm": 0.8390122703405435, + "learning_rate": 4.614812435598275e-06, + "loss": 0.2571, + "num_tokens": 86100651.0, + "step": 779 + }, + { + "epoch": 5.7380073800738005, + "grad_norm": 0.7184304402233668, + "learning_rate": 4.604226029742697e-06, + "loss": 0.244, + "num_tokens": 86208137.0, + "step": 780 + }, + { + "epoch": 5.745387453874539, + "grad_norm": 0.7598391769774648, + "learning_rate": 4.593644783518581e-06, + "loss": 0.2557, + "num_tokens": 86318101.0, + "step": 781 + }, + { + "epoch": 5.752767527675276, + "grad_norm": 0.7065705531981052, + "learning_rate": 4.58306875787359e-06, + "loss": 0.2954, + "num_tokens": 86469562.0, + "step": 782 + }, + { + "epoch": 5.760147601476015, + "grad_norm": 0.6639915141423608, + "learning_rate": 4.572498013725319e-06, + "loss": 0.2974, + "num_tokens": 86572873.0, + "step": 783 + }, + { + "epoch": 5.767527675276753, + "grad_norm": 0.8612338852207884, + "learning_rate": 4.561932611960935e-06, + "loss": 0.3595, + "num_tokens": 86705910.0, + "step": 784 + }, + { + "epoch": 5.7749077490774905, + "grad_norm": 0.7365204739437818, + "learning_rate": 4.551372613436845e-06, + "loss": 0.2591, + "num_tokens": 86806358.0, + "step": 785 + }, + { + "epoch": 5.782287822878229, + "grad_norm": 0.7811812979489162, + "learning_rate": 4.540818078978324e-06, + "loss": 0.3246, + "num_tokens": 86898194.0, + "step": 786 + }, + { + "epoch": 5.789667896678967, + "grad_norm": 0.8300575223005775, + "learning_rate": 4.5302690693791785e-06, + "loss": 0.2407, + "num_tokens": 87006345.0, + "step": 787 + }, + { + "epoch": 5.797047970479705, + "grad_norm": 0.8086907029628351, + "learning_rate": 4.519725645401387e-06, + "loss": 0.2785, + "num_tokens": 87117120.0, + "step": 788 + }, + { + "epoch": 5.804428044280443, + "grad_norm": 0.6746765611467019, + "learning_rate": 4.509187867774762e-06, + "loss": 0.5363, + "num_tokens": 87251836.0, + "step": 789 + }, + { + "epoch": 5.8118081180811805, + "grad_norm": 0.8287931036381043, + "learning_rate": 4.4986557971965865e-06, + "loss": 0.2669, + "num_tokens": 87357495.0, + "step": 790 + }, + { + "epoch": 5.819188191881919, + "grad_norm": 0.6706419685085858, + "learning_rate": 4.488129494331276e-06, + "loss": 0.2355, + "num_tokens": 87446459.0, + "step": 791 + }, + { + "epoch": 5.826568265682657, + "grad_norm": 0.786872973851237, + "learning_rate": 4.477609019810022e-06, + "loss": 0.2984, + "num_tokens": 87549420.0, + "step": 792 + }, + { + "epoch": 5.833948339483395, + "grad_norm": 0.7754342944870574, + "learning_rate": 4.467094434230445e-06, + "loss": 0.2939, + "num_tokens": 87646947.0, + "step": 793 + }, + { + "epoch": 5.841328413284133, + "grad_norm": 0.8200545823833467, + "learning_rate": 4.456585798156246e-06, + "loss": 0.2938, + "num_tokens": 87764542.0, + "step": 794 + }, + { + "epoch": 5.8487084870848705, + "grad_norm": 0.7562355763532292, + "learning_rate": 4.446083172116858e-06, + "loss": 0.302, + "num_tokens": 87872220.0, + "step": 795 + }, + { + "epoch": 5.856088560885609, + "grad_norm": 0.9046034705903128, + "learning_rate": 4.435586616607094e-06, + "loss": 0.3068, + "num_tokens": 87971032.0, + "step": 796 + }, + { + "epoch": 5.863468634686347, + "grad_norm": 0.7704842545134923, + "learning_rate": 4.4250961920868005e-06, + "loss": 0.2637, + "num_tokens": 88064195.0, + "step": 797 + }, + { + "epoch": 5.870848708487085, + "grad_norm": 0.7742359193055628, + "learning_rate": 4.414611958980512e-06, + "loss": 0.2681, + "num_tokens": 88172050.0, + "step": 798 + }, + { + "epoch": 5.878228782287823, + "grad_norm": 0.7056198246854335, + "learning_rate": 4.404133977677101e-06, + "loss": 0.2496, + "num_tokens": 88300724.0, + "step": 799 + }, + { + "epoch": 5.885608856088561, + "grad_norm": 0.7125491273298457, + "learning_rate": 4.393662308529427e-06, + "loss": 0.2427, + "num_tokens": 88431435.0, + "step": 800 + }, + { + "epoch": 5.892988929889299, + "grad_norm": 0.6511055726535395, + "learning_rate": 4.383197011853993e-06, + "loss": 0.3121, + "num_tokens": 88549593.0, + "step": 801 + }, + { + "epoch": 5.900369003690037, + "grad_norm": 0.9074280458375835, + "learning_rate": 4.372738147930599e-06, + "loss": 0.2847, + "num_tokens": 88643972.0, + "step": 802 + }, + { + "epoch": 5.907749077490775, + "grad_norm": 0.7638665779652982, + "learning_rate": 4.362285777001989e-06, + "loss": 0.3132, + "num_tokens": 88769522.0, + "step": 803 + }, + { + "epoch": 5.915129151291513, + "grad_norm": 0.8305226024562709, + "learning_rate": 4.35183995927351e-06, + "loss": 0.2635, + "num_tokens": 88859548.0, + "step": 804 + }, + { + "epoch": 5.922509225092251, + "grad_norm": 0.9231553389167491, + "learning_rate": 4.34140075491276e-06, + "loss": 0.3237, + "num_tokens": 88951297.0, + "step": 805 + }, + { + "epoch": 5.929889298892989, + "grad_norm": 0.6204930789359487, + "learning_rate": 4.330968224049248e-06, + "loss": 0.3277, + "num_tokens": 89077101.0, + "step": 806 + }, + { + "epoch": 5.937269372693727, + "grad_norm": 0.7903970100872257, + "learning_rate": 4.320542426774042e-06, + "loss": 0.2492, + "num_tokens": 89162461.0, + "step": 807 + }, + { + "epoch": 5.944649446494465, + "grad_norm": 0.7374651281826408, + "learning_rate": 4.310123423139422e-06, + "loss": 0.3697, + "num_tokens": 89267370.0, + "step": 808 + }, + { + "epoch": 5.952029520295203, + "grad_norm": 0.6716770325771486, + "learning_rate": 4.299711273158542e-06, + "loss": 0.2777, + "num_tokens": 89408407.0, + "step": 809 + }, + { + "epoch": 5.959409594095941, + "grad_norm": 0.8587406649423595, + "learning_rate": 4.289306036805077e-06, + "loss": 0.2774, + "num_tokens": 89497445.0, + "step": 810 + }, + { + "epoch": 5.966789667896679, + "grad_norm": 0.8292847782488489, + "learning_rate": 4.278907774012876e-06, + "loss": 0.2557, + "num_tokens": 89617678.0, + "step": 811 + }, + { + "epoch": 5.974169741697417, + "grad_norm": 0.7438193152871864, + "learning_rate": 4.268516544675628e-06, + "loss": 0.2441, + "num_tokens": 89714542.0, + "step": 812 + }, + { + "epoch": 5.9815498154981555, + "grad_norm": 0.6870379095079707, + "learning_rate": 4.258132408646503e-06, + "loss": 0.2228, + "num_tokens": 89823849.0, + "step": 813 + }, + { + "epoch": 5.988929889298893, + "grad_norm": 0.7013293356784026, + "learning_rate": 4.247755425737816e-06, + "loss": 0.2735, + "num_tokens": 89919377.0, + "step": 814 + }, + { + "epoch": 5.996309963099631, + "grad_norm": 0.773828181140405, + "learning_rate": 4.237385655720681e-06, + "loss": 0.3245, + "num_tokens": 90038971.0, + "step": 815 + }, + { + "epoch": 6.0, + "grad_norm": 1.2173223861261109, + "learning_rate": 4.227023158324666e-06, + "loss": 0.2545, + "num_tokens": 90106097.0, + "step": 816 + }, + { + "epoch": 6.007380073800738, + "grad_norm": 0.754619177850811, + "learning_rate": 4.216667993237445e-06, + "loss": 0.2196, + "num_tokens": 90213239.0, + "step": 817 + }, + { + "epoch": 6.014760147601476, + "grad_norm": 0.7501441529725348, + "learning_rate": 4.206320220104464e-06, + "loss": 0.2059, + "num_tokens": 90306923.0, + "step": 818 + }, + { + "epoch": 6.022140221402214, + "grad_norm": 0.823773233978655, + "learning_rate": 4.195979898528589e-06, + "loss": 0.2554, + "num_tokens": 90439883.0, + "step": 819 + }, + { + "epoch": 6.029520295202952, + "grad_norm": 0.6126983908864717, + "learning_rate": 4.185647088069765e-06, + "loss": 0.4358, + "num_tokens": 90610253.0, + "step": 820 + }, + { + "epoch": 6.03690036900369, + "grad_norm": 0.8108549742677466, + "learning_rate": 4.175321848244673e-06, + "loss": 0.1915, + "num_tokens": 90701249.0, + "step": 821 + }, + { + "epoch": 6.044280442804428, + "grad_norm": 0.8821882563133362, + "learning_rate": 4.165004238526388e-06, + "loss": 0.2422, + "num_tokens": 90816090.0, + "step": 822 + }, + { + "epoch": 6.051660516605166, + "grad_norm": 1.1293556042513082, + "learning_rate": 4.1546943183440344e-06, + "loss": 0.2124, + "num_tokens": 90908059.0, + "step": 823 + }, + { + "epoch": 6.059040590405904, + "grad_norm": 1.1286542477044417, + "learning_rate": 4.144392147082448e-06, + "loss": 0.198, + "num_tokens": 90988560.0, + "step": 824 + }, + { + "epoch": 6.0664206642066425, + "grad_norm": 0.9270382141381814, + "learning_rate": 4.134097784081826e-06, + "loss": 0.1983, + "num_tokens": 91093452.0, + "step": 825 + }, + { + "epoch": 6.07380073800738, + "grad_norm": 0.8295770283434506, + "learning_rate": 4.123811288637397e-06, + "loss": 0.1738, + "num_tokens": 91212659.0, + "step": 826 + }, + { + "epoch": 6.081180811808118, + "grad_norm": 0.7985117078350946, + "learning_rate": 4.113532719999067e-06, + "loss": 0.1932, + "num_tokens": 91327661.0, + "step": 827 + }, + { + "epoch": 6.088560885608856, + "grad_norm": 0.7313065345505175, + "learning_rate": 4.103262137371087e-06, + "loss": 0.2618, + "num_tokens": 91460086.0, + "step": 828 + }, + { + "epoch": 6.095940959409594, + "grad_norm": 0.7559559275244944, + "learning_rate": 4.0929995999117085e-06, + "loss": 0.2455, + "num_tokens": 91569256.0, + "step": 829 + }, + { + "epoch": 6.1033210332103325, + "grad_norm": 0.8028774258101546, + "learning_rate": 4.082745166732842e-06, + "loss": 0.2247, + "num_tokens": 91675596.0, + "step": 830 + }, + { + "epoch": 6.11070110701107, + "grad_norm": 0.7499423102410786, + "learning_rate": 4.072498896899718e-06, + "loss": 0.2294, + "num_tokens": 91781549.0, + "step": 831 + }, + { + "epoch": 6.118081180811808, + "grad_norm": 0.6529902129393093, + "learning_rate": 4.062260849430546e-06, + "loss": 0.2013, + "num_tokens": 91927478.0, + "step": 832 + }, + { + "epoch": 6.125461254612546, + "grad_norm": 0.6601754149059782, + "learning_rate": 4.052031083296175e-06, + "loss": 0.1806, + "num_tokens": 92041926.0, + "step": 833 + }, + { + "epoch": 6.132841328413284, + "grad_norm": 0.7415551312116556, + "learning_rate": 4.041809657419755e-06, + "loss": 0.1921, + "num_tokens": 92167319.0, + "step": 834 + }, + { + "epoch": 6.1402214022140225, + "grad_norm": 0.7601195859487445, + "learning_rate": 4.031596630676394e-06, + "loss": 0.2371, + "num_tokens": 92286885.0, + "step": 835 + }, + { + "epoch": 6.14760147601476, + "grad_norm": 0.7380371385249106, + "learning_rate": 4.021392061892824e-06, + "loss": 0.2135, + "num_tokens": 92389683.0, + "step": 836 + }, + { + "epoch": 6.154981549815498, + "grad_norm": 0.7925402829629689, + "learning_rate": 4.011196009847057e-06, + "loss": 0.1935, + "num_tokens": 92496539.0, + "step": 837 + }, + { + "epoch": 6.162361623616236, + "grad_norm": 0.8074333970201613, + "learning_rate": 4.001008533268052e-06, + "loss": 0.2198, + "num_tokens": 92601836.0, + "step": 838 + }, + { + "epoch": 6.169741697416974, + "grad_norm": 0.8638114016631451, + "learning_rate": 3.990829690835369e-06, + "loss": 0.1886, + "num_tokens": 92693125.0, + "step": 839 + }, + { + "epoch": 6.177121771217712, + "grad_norm": 0.9153865801186486, + "learning_rate": 3.980659541178842e-06, + "loss": 0.2108, + "num_tokens": 92778023.0, + "step": 840 + }, + { + "epoch": 6.18450184501845, + "grad_norm": 0.711842304748427, + "learning_rate": 3.970498142878226e-06, + "loss": 0.1809, + "num_tokens": 92897683.0, + "step": 841 + }, + { + "epoch": 6.191881918819188, + "grad_norm": 0.6796198153078575, + "learning_rate": 3.9603455544628754e-06, + "loss": 0.1942, + "num_tokens": 93011980.0, + "step": 842 + }, + { + "epoch": 6.199261992619927, + "grad_norm": 0.7385851732118687, + "learning_rate": 3.9502018344113975e-06, + "loss": 0.2006, + "num_tokens": 93136769.0, + "step": 843 + }, + { + "epoch": 6.206642066420664, + "grad_norm": 0.7724145049291772, + "learning_rate": 3.94006704115132e-06, + "loss": 0.2243, + "num_tokens": 93233152.0, + "step": 844 + }, + { + "epoch": 6.214022140221402, + "grad_norm": 0.8193526198793998, + "learning_rate": 3.9299412330587504e-06, + "loss": 0.2172, + "num_tokens": 93337100.0, + "step": 845 + }, + { + "epoch": 6.22140221402214, + "grad_norm": 0.8250040421131153, + "learning_rate": 3.919824468458041e-06, + "loss": 0.2384, + "num_tokens": 93501961.0, + "step": 846 + }, + { + "epoch": 6.228782287822878, + "grad_norm": 0.743900475287342, + "learning_rate": 3.909716805621459e-06, + "loss": 0.2542, + "num_tokens": 93634298.0, + "step": 847 + }, + { + "epoch": 6.236162361623617, + "grad_norm": 0.7646462253424315, + "learning_rate": 3.899618302768837e-06, + "loss": 0.2076, + "num_tokens": 93746723.0, + "step": 848 + }, + { + "epoch": 6.243542435424354, + "grad_norm": 0.7762869644495579, + "learning_rate": 3.889529018067256e-06, + "loss": 0.7056, + "num_tokens": 93884297.0, + "step": 849 + }, + { + "epoch": 6.250922509225092, + "grad_norm": 0.80931274189963, + "learning_rate": 3.879449009630694e-06, + "loss": 0.2629, + "num_tokens": 93990776.0, + "step": 850 + }, + { + "epoch": 6.25830258302583, + "grad_norm": 0.6833787978478929, + "learning_rate": 3.869378335519701e-06, + "loss": 0.2117, + "num_tokens": 94117404.0, + "step": 851 + }, + { + "epoch": 6.265682656826568, + "grad_norm": 0.7413250935105922, + "learning_rate": 3.8593170537410585e-06, + "loss": 0.1748, + "num_tokens": 94213518.0, + "step": 852 + }, + { + "epoch": 6.273062730627307, + "grad_norm": 0.7159824628287972, + "learning_rate": 3.849265222247452e-06, + "loss": 0.1929, + "num_tokens": 94334198.0, + "step": 853 + }, + { + "epoch": 6.280442804428044, + "grad_norm": 0.8765176099031716, + "learning_rate": 3.839222898937136e-06, + "loss": 0.212, + "num_tokens": 94421654.0, + "step": 854 + }, + { + "epoch": 6.287822878228782, + "grad_norm": 0.6509756542972459, + "learning_rate": 3.8291901416535895e-06, + "loss": 0.1778, + "num_tokens": 94534406.0, + "step": 855 + }, + { + "epoch": 6.29520295202952, + "grad_norm": 0.8400790274459855, + "learning_rate": 3.819167008185198e-06, + "loss": 0.2379, + "num_tokens": 94652173.0, + "step": 856 + }, + { + "epoch": 6.302583025830258, + "grad_norm": 0.9179393571701614, + "learning_rate": 3.809153556264914e-06, + "loss": 0.2516, + "num_tokens": 94749142.0, + "step": 857 + }, + { + "epoch": 6.3099630996309966, + "grad_norm": 0.8240154718085584, + "learning_rate": 3.7991498435699213e-06, + "loss": 0.4507, + "num_tokens": 94853046.0, + "step": 858 + }, + { + "epoch": 6.317343173431734, + "grad_norm": 0.7493697083757599, + "learning_rate": 3.7891559277213095e-06, + "loss": 0.1833, + "num_tokens": 94945423.0, + "step": 859 + }, + { + "epoch": 6.324723247232472, + "grad_norm": 0.990775891335437, + "learning_rate": 3.779171866283734e-06, + "loss": 0.2106, + "num_tokens": 95038518.0, + "step": 860 + }, + { + "epoch": 6.332103321033211, + "grad_norm": 0.8525222926707304, + "learning_rate": 3.7691977167650952e-06, + "loss": 0.2581, + "num_tokens": 95160151.0, + "step": 861 + }, + { + "epoch": 6.339483394833948, + "grad_norm": 0.6823449059636649, + "learning_rate": 3.759233536616197e-06, + "loss": 0.2158, + "num_tokens": 95275795.0, + "step": 862 + }, + { + "epoch": 6.3468634686346865, + "grad_norm": 0.7102219478305801, + "learning_rate": 3.749279383230421e-06, + "loss": 0.2193, + "num_tokens": 95392967.0, + "step": 863 + }, + { + "epoch": 6.354243542435424, + "grad_norm": 0.7387604183406936, + "learning_rate": 3.7393353139433952e-06, + "loss": 0.2265, + "num_tokens": 95514701.0, + "step": 864 + }, + { + "epoch": 6.361623616236162, + "grad_norm": 0.7600271999176372, + "learning_rate": 3.729401386032663e-06, + "loss": 0.2192, + "num_tokens": 95617043.0, + "step": 865 + }, + { + "epoch": 6.369003690036901, + "grad_norm": 0.8308997001060889, + "learning_rate": 3.719477656717355e-06, + "loss": 0.1975, + "num_tokens": 95703514.0, + "step": 866 + }, + { + "epoch": 6.376383763837638, + "grad_norm": 0.7156260219335532, + "learning_rate": 3.7095641831578567e-06, + "loss": 0.2035, + "num_tokens": 95812288.0, + "step": 867 + }, + { + "epoch": 6.3837638376383765, + "grad_norm": 0.8875140819742275, + "learning_rate": 3.699661022455482e-06, + "loss": 0.1898, + "num_tokens": 95901945.0, + "step": 868 + }, + { + "epoch": 6.391143911439114, + "grad_norm": 0.6696637884259364, + "learning_rate": 3.689768231652141e-06, + "loss": 0.2058, + "num_tokens": 96049902.0, + "step": 869 + }, + { + "epoch": 6.398523985239852, + "grad_norm": 0.7905426177757932, + "learning_rate": 3.6798858677300143e-06, + "loss": 0.1587, + "num_tokens": 96153520.0, + "step": 870 + }, + { + "epoch": 6.405904059040591, + "grad_norm": 0.8013809496278096, + "learning_rate": 3.670013987611226e-06, + "loss": 0.1917, + "num_tokens": 96252026.0, + "step": 871 + }, + { + "epoch": 6.413284132841328, + "grad_norm": 0.8162319784741491, + "learning_rate": 3.6601526481575133e-06, + "loss": 0.2048, + "num_tokens": 96348710.0, + "step": 872 + }, + { + "epoch": 6.4206642066420665, + "grad_norm": 0.7328529544115915, + "learning_rate": 3.650301906169896e-06, + "loss": 0.1907, + "num_tokens": 96440569.0, + "step": 873 + }, + { + "epoch": 6.428044280442805, + "grad_norm": 0.8296301704513374, + "learning_rate": 3.640461818388359e-06, + "loss": 0.1968, + "num_tokens": 96532079.0, + "step": 874 + }, + { + "epoch": 6.435424354243542, + "grad_norm": 0.6444682162510676, + "learning_rate": 3.630632441491512e-06, + "loss": 0.1823, + "num_tokens": 96643844.0, + "step": 875 + }, + { + "epoch": 6.442804428044281, + "grad_norm": 0.744390482720809, + "learning_rate": 3.620813832096275e-06, + "loss": 0.2158, + "num_tokens": 96758044.0, + "step": 876 + }, + { + "epoch": 6.450184501845018, + "grad_norm": 0.7123681226464863, + "learning_rate": 3.611006046757547e-06, + "loss": 0.2033, + "num_tokens": 96862371.0, + "step": 877 + }, + { + "epoch": 6.4575645756457565, + "grad_norm": 0.7526500553085586, + "learning_rate": 3.6012091419678808e-06, + "loss": 0.1947, + "num_tokens": 96981106.0, + "step": 878 + }, + { + "epoch": 6.464944649446495, + "grad_norm": 0.7614999330943449, + "learning_rate": 3.591423174157154e-06, + "loss": 0.1645, + "num_tokens": 97107831.0, + "step": 879 + }, + { + "epoch": 6.472324723247232, + "grad_norm": 0.7011776983112978, + "learning_rate": 3.581648199692255e-06, + "loss": 0.2517, + "num_tokens": 97246067.0, + "step": 880 + }, + { + "epoch": 6.479704797047971, + "grad_norm": 0.6553053730395717, + "learning_rate": 3.5718842748767447e-06, + "loss": 0.2208, + "num_tokens": 97385229.0, + "step": 881 + }, + { + "epoch": 6.487084870848708, + "grad_norm": 0.8167144132408032, + "learning_rate": 3.5621314559505383e-06, + "loss": 0.2525, + "num_tokens": 97528757.0, + "step": 882 + }, + { + "epoch": 6.4944649446494465, + "grad_norm": 0.7398027869252124, + "learning_rate": 3.552389799089584e-06, + "loss": 0.1958, + "num_tokens": 97642105.0, + "step": 883 + }, + { + "epoch": 6.501845018450185, + "grad_norm": 0.8569939875013843, + "learning_rate": 3.542659360405537e-06, + "loss": 0.2649, + "num_tokens": 97742503.0, + "step": 884 + }, + { + "epoch": 6.509225092250922, + "grad_norm": 0.6510247379643058, + "learning_rate": 3.5329401959454348e-06, + "loss": 0.152, + "num_tokens": 97849892.0, + "step": 885 + }, + { + "epoch": 6.516605166051661, + "grad_norm": 0.8610923117709987, + "learning_rate": 3.5232323616913745e-06, + "loss": 0.208, + "num_tokens": 97943605.0, + "step": 886 + }, + { + "epoch": 6.523985239852399, + "grad_norm": 0.786724364976123, + "learning_rate": 3.513535913560194e-06, + "loss": 0.1973, + "num_tokens": 98032889.0, + "step": 887 + }, + { + "epoch": 6.531365313653136, + "grad_norm": 0.6947759355032901, + "learning_rate": 3.5038509074031444e-06, + "loss": 0.1906, + "num_tokens": 98150384.0, + "step": 888 + }, + { + "epoch": 6.538745387453875, + "grad_norm": 0.6696060626479275, + "learning_rate": 3.4941773990055784e-06, + "loss": 0.2248, + "num_tokens": 98287503.0, + "step": 889 + }, + { + "epoch": 6.546125461254612, + "grad_norm": 0.7507830986698222, + "learning_rate": 3.4845154440866137e-06, + "loss": 0.1848, + "num_tokens": 98381090.0, + "step": 890 + }, + { + "epoch": 6.553505535055351, + "grad_norm": 0.7553275368445391, + "learning_rate": 3.4748650982988245e-06, + "loss": 0.2097, + "num_tokens": 98516065.0, + "step": 891 + }, + { + "epoch": 6.560885608856088, + "grad_norm": 0.8156256331640636, + "learning_rate": 3.4652264172279153e-06, + "loss": 0.178, + "num_tokens": 98601146.0, + "step": 892 + }, + { + "epoch": 6.568265682656826, + "grad_norm": 0.7603646969200515, + "learning_rate": 3.4555994563924034e-06, + "loss": 0.2084, + "num_tokens": 98721470.0, + "step": 893 + }, + { + "epoch": 6.575645756457565, + "grad_norm": 0.7577945918254975, + "learning_rate": 3.4459842712432957e-06, + "loss": 0.1636, + "num_tokens": 98820078.0, + "step": 894 + }, + { + "epoch": 6.583025830258302, + "grad_norm": 0.8005240275250651, + "learning_rate": 3.436380917163775e-06, + "loss": 0.2031, + "num_tokens": 98943791.0, + "step": 895 + }, + { + "epoch": 6.590405904059041, + "grad_norm": 0.799801084880718, + "learning_rate": 3.4267894494688735e-06, + "loss": 0.2296, + "num_tokens": 99082296.0, + "step": 896 + }, + { + "epoch": 6.597785977859779, + "grad_norm": 0.7522478657943151, + "learning_rate": 3.417209923405163e-06, + "loss": 0.2294, + "num_tokens": 99223930.0, + "step": 897 + }, + { + "epoch": 6.605166051660516, + "grad_norm": 0.7058009301544409, + "learning_rate": 3.407642394150429e-06, + "loss": 0.1766, + "num_tokens": 99344618.0, + "step": 898 + }, + { + "epoch": 6.612546125461255, + "grad_norm": 0.8022735323233764, + "learning_rate": 3.3980869168133533e-06, + "loss": 0.1995, + "num_tokens": 99463898.0, + "step": 899 + }, + { + "epoch": 6.619926199261993, + "grad_norm": 0.7663300241940983, + "learning_rate": 3.3885435464332028e-06, + "loss": 0.1818, + "num_tokens": 99570004.0, + "step": 900 + }, + { + "epoch": 6.627306273062731, + "grad_norm": 0.7523025442584772, + "learning_rate": 3.379012337979507e-06, + "loss": 0.1806, + "num_tokens": 99684132.0, + "step": 901 + }, + { + "epoch": 6.634686346863469, + "grad_norm": 0.7324752348611164, + "learning_rate": 3.3694933463517443e-06, + "loss": 0.2474, + "num_tokens": 99805955.0, + "step": 902 + }, + { + "epoch": 6.642066420664206, + "grad_norm": 0.8812132921472331, + "learning_rate": 3.3599866263790227e-06, + "loss": 0.2032, + "num_tokens": 99902512.0, + "step": 903 + }, + { + "epoch": 6.649446494464945, + "grad_norm": 0.7770802676786434, + "learning_rate": 3.3504922328197675e-06, + "loss": 0.1704, + "num_tokens": 100013792.0, + "step": 904 + }, + { + "epoch": 6.656826568265682, + "grad_norm": 0.9755321311222953, + "learning_rate": 3.3410102203614024e-06, + "loss": 0.2631, + "num_tokens": 100119726.0, + "step": 905 + }, + { + "epoch": 6.6642066420664205, + "grad_norm": 0.6557407401122956, + "learning_rate": 3.331540643620039e-06, + "loss": 0.1933, + "num_tokens": 100245125.0, + "step": 906 + }, + { + "epoch": 6.671586715867159, + "grad_norm": 0.7389180742131263, + "learning_rate": 3.322083557140159e-06, + "loss": 0.2223, + "num_tokens": 100383345.0, + "step": 907 + }, + { + "epoch": 6.678966789667896, + "grad_norm": 0.7429765561465446, + "learning_rate": 3.3126390153942977e-06, + "loss": 0.2308, + "num_tokens": 100502064.0, + "step": 908 + }, + { + "epoch": 6.686346863468635, + "grad_norm": 0.8927199535053768, + "learning_rate": 3.3032070727827358e-06, + "loss": 0.2295, + "num_tokens": 100582734.0, + "step": 909 + }, + { + "epoch": 6.693726937269373, + "grad_norm": 0.876892615200498, + "learning_rate": 3.293787783633182e-06, + "loss": 0.1841, + "num_tokens": 100673332.0, + "step": 910 + }, + { + "epoch": 6.7011070110701105, + "grad_norm": 0.6604620029571926, + "learning_rate": 3.2843812022004606e-06, + "loss": 0.1542, + "num_tokens": 100765299.0, + "step": 911 + }, + { + "epoch": 6.708487084870849, + "grad_norm": 0.7407674652150991, + "learning_rate": 3.2749873826662047e-06, + "loss": 0.2064, + "num_tokens": 100882033.0, + "step": 912 + }, + { + "epoch": 6.715867158671586, + "grad_norm": 0.86708507778245, + "learning_rate": 3.265606379138534e-06, + "loss": 0.1649, + "num_tokens": 100963233.0, + "step": 913 + }, + { + "epoch": 6.723247232472325, + "grad_norm": 0.8608589329839527, + "learning_rate": 3.2562382456517495e-06, + "loss": 0.2295, + "num_tokens": 101055152.0, + "step": 914 + }, + { + "epoch": 6.730627306273063, + "grad_norm": 0.9392155344063237, + "learning_rate": 3.246883036166023e-06, + "loss": 0.1896, + "num_tokens": 101137124.0, + "step": 915 + }, + { + "epoch": 6.7380073800738005, + "grad_norm": 0.9396451567910546, + "learning_rate": 3.2375408045670836e-06, + "loss": 0.2353, + "num_tokens": 101239694.0, + "step": 916 + }, + { + "epoch": 6.745387453874539, + "grad_norm": 0.7896628007083686, + "learning_rate": 3.228211604665907e-06, + "loss": 0.1686, + "num_tokens": 101342164.0, + "step": 917 + }, + { + "epoch": 6.752767527675276, + "grad_norm": 0.8310372293464977, + "learning_rate": 3.218895490198407e-06, + "loss": 0.2091, + "num_tokens": 101448880.0, + "step": 918 + }, + { + "epoch": 6.760147601476015, + "grad_norm": 0.7450885761579481, + "learning_rate": 3.2095925148251273e-06, + "loss": 0.1777, + "num_tokens": 101546758.0, + "step": 919 + }, + { + "epoch": 6.767527675276753, + "grad_norm": 0.7199744105820569, + "learning_rate": 3.2003027321309287e-06, + "loss": 0.151, + "num_tokens": 101650068.0, + "step": 920 + }, + { + "epoch": 6.7749077490774905, + "grad_norm": 0.674624666564349, + "learning_rate": 3.1910261956246845e-06, + "loss": 0.2322, + "num_tokens": 101770709.0, + "step": 921 + }, + { + "epoch": 6.782287822878229, + "grad_norm": 0.878038648852318, + "learning_rate": 3.1817629587389675e-06, + "loss": 0.2003, + "num_tokens": 101857734.0, + "step": 922 + }, + { + "epoch": 6.789667896678967, + "grad_norm": 0.7892653869940014, + "learning_rate": 3.17251307482975e-06, + "loss": 0.3376, + "num_tokens": 101956249.0, + "step": 923 + }, + { + "epoch": 6.797047970479705, + "grad_norm": 0.7905826098236308, + "learning_rate": 3.1632765971760875e-06, + "loss": 0.2093, + "num_tokens": 102044561.0, + "step": 924 + }, + { + "epoch": 6.804428044280443, + "grad_norm": 0.8891968097952797, + "learning_rate": 3.1540535789798168e-06, + "loss": 0.2193, + "num_tokens": 102137661.0, + "step": 925 + }, + { + "epoch": 6.8118081180811805, + "grad_norm": 0.7500480325072951, + "learning_rate": 3.144844073365247e-06, + "loss": 0.2044, + "num_tokens": 102247189.0, + "step": 926 + }, + { + "epoch": 6.819188191881919, + "grad_norm": 0.6365731589988368, + "learning_rate": 3.135648133378859e-06, + "loss": 0.1924, + "num_tokens": 102359037.0, + "step": 927 + }, + { + "epoch": 6.826568265682657, + "grad_norm": 0.7028378561369589, + "learning_rate": 3.126465811988994e-06, + "loss": 0.2335, + "num_tokens": 102506594.0, + "step": 928 + }, + { + "epoch": 6.833948339483395, + "grad_norm": 0.7532976280521092, + "learning_rate": 3.1172971620855477e-06, + "loss": 0.2093, + "num_tokens": 102647771.0, + "step": 929 + }, + { + "epoch": 6.841328413284133, + "grad_norm": 0.734868379726991, + "learning_rate": 3.108142236479675e-06, + "loss": 0.2066, + "num_tokens": 102759707.0, + "step": 930 + }, + { + "epoch": 6.8487084870848705, + "grad_norm": 0.7295400190719857, + "learning_rate": 3.099001087903473e-06, + "loss": 0.1763, + "num_tokens": 102887449.0, + "step": 931 + }, + { + "epoch": 6.856088560885609, + "grad_norm": 0.6572139423314011, + "learning_rate": 3.0898737690096857e-06, + "loss": 0.1564, + "num_tokens": 103005173.0, + "step": 932 + }, + { + "epoch": 6.863468634686347, + "grad_norm": 0.8052758067274763, + "learning_rate": 3.080760332371402e-06, + "loss": 0.2305, + "num_tokens": 103137118.0, + "step": 933 + }, + { + "epoch": 6.870848708487085, + "grad_norm": 0.7881555315391795, + "learning_rate": 3.071660830481743e-06, + "loss": 0.4904, + "num_tokens": 103270118.0, + "step": 934 + }, + { + "epoch": 6.878228782287823, + "grad_norm": 0.7603387680036207, + "learning_rate": 3.062575315753571e-06, + "loss": 0.1851, + "num_tokens": 103376975.0, + "step": 935 + }, + { + "epoch": 6.885608856088561, + "grad_norm": 0.7912312318357577, + "learning_rate": 3.0535038405191804e-06, + "loss": 0.1821, + "num_tokens": 103446424.0, + "step": 936 + }, + { + "epoch": 6.892988929889299, + "grad_norm": 0.904347558736632, + "learning_rate": 3.0444464570299992e-06, + "loss": 0.2031, + "num_tokens": 103533858.0, + "step": 937 + }, + { + "epoch": 6.900369003690037, + "grad_norm": 0.730765026740344, + "learning_rate": 3.0354032174562864e-06, + "loss": 0.1827, + "num_tokens": 103624447.0, + "step": 938 + }, + { + "epoch": 6.907749077490775, + "grad_norm": 0.6950005683139954, + "learning_rate": 3.0263741738868348e-06, + "loss": 0.1797, + "num_tokens": 103739077.0, + "step": 939 + }, + { + "epoch": 6.915129151291513, + "grad_norm": 0.7767121805718566, + "learning_rate": 3.0173593783286644e-06, + "loss": 0.2039, + "num_tokens": 103838997.0, + "step": 940 + }, + { + "epoch": 6.922509225092251, + "grad_norm": 0.7312298513437542, + "learning_rate": 3.0083588827067334e-06, + "loss": 0.1762, + "num_tokens": 103934375.0, + "step": 941 + }, + { + "epoch": 6.929889298892989, + "grad_norm": 0.7258774991287039, + "learning_rate": 2.999372738863627e-06, + "loss": 0.1941, + "num_tokens": 104057875.0, + "step": 942 + }, + { + "epoch": 6.937269372693727, + "grad_norm": 0.906804533467632, + "learning_rate": 2.9904009985592685e-06, + "loss": 0.182, + "num_tokens": 104149323.0, + "step": 943 + }, + { + "epoch": 6.944649446494465, + "grad_norm": 0.7906694700640292, + "learning_rate": 2.981443713470614e-06, + "loss": 0.1898, + "num_tokens": 104234856.0, + "step": 944 + }, + { + "epoch": 6.952029520295203, + "grad_norm": 0.7908183194822428, + "learning_rate": 2.972500935191361e-06, + "loss": 0.2914, + "num_tokens": 104400351.0, + "step": 945 + }, + { + "epoch": 6.959409594095941, + "grad_norm": 0.782710003770315, + "learning_rate": 2.963572715231645e-06, + "loss": 0.2288, + "num_tokens": 104502341.0, + "step": 946 + }, + { + "epoch": 6.966789667896679, + "grad_norm": 0.6924748220617756, + "learning_rate": 2.9546591050177475e-06, + "loss": 0.2123, + "num_tokens": 104642099.0, + "step": 947 + }, + { + "epoch": 6.974169741697417, + "grad_norm": 0.7826865188420538, + "learning_rate": 2.9457601558918e-06, + "loss": 0.2013, + "num_tokens": 104750363.0, + "step": 948 + }, + { + "epoch": 6.9815498154981555, + "grad_norm": 0.880677527246301, + "learning_rate": 2.936875919111485e-06, + "loss": 0.1964, + "num_tokens": 104834582.0, + "step": 949 + }, + { + "epoch": 6.988929889298893, + "grad_norm": 0.7502521154320814, + "learning_rate": 2.928006445849743e-06, + "loss": 0.178, + "num_tokens": 104941682.0, + "step": 950 + }, + { + "epoch": 6.996309963099631, + "grad_norm": 0.7485303984947371, + "learning_rate": 2.9191517871944763e-06, + "loss": 0.2267, + "num_tokens": 105084014.0, + "step": 951 + }, + { + "epoch": 7.0, + "grad_norm": 0.7485303984947371, + "learning_rate": 2.910311994148255e-06, + "loss": 0.1983, + "num_tokens": 105124687.0, + "step": 952 + }, + { + "epoch": 7.007380073800738, + "grad_norm": 1.327378191458176, + "learning_rate": 2.901487117628025e-06, + "loss": 0.1776, + "num_tokens": 105230003.0, + "step": 953 + }, + { + "epoch": 7.014760147601476, + "grad_norm": 0.7273680072948099, + "learning_rate": 2.892677208464811e-06, + "loss": 0.1327, + "num_tokens": 105323072.0, + "step": 954 + }, + { + "epoch": 7.022140221402214, + "grad_norm": 0.5306741304213602, + "learning_rate": 2.8838823174034314e-06, + "loss": 0.1744, + "num_tokens": 105461293.0, + "step": 955 + }, + { + "epoch": 7.029520295202952, + "grad_norm": 0.5746012125314613, + "learning_rate": 2.8751024951021954e-06, + "loss": 0.1332, + "num_tokens": 105593715.0, + "step": 956 + }, + { + "epoch": 7.03690036900369, + "grad_norm": 0.7356406852351963, + "learning_rate": 2.866337792132618e-06, + "loss": 0.1638, + "num_tokens": 105690443.0, + "step": 957 + }, + { + "epoch": 7.044280442804428, + "grad_norm": 0.6708094431234726, + "learning_rate": 2.85758825897913e-06, + "loss": 0.1411, + "num_tokens": 105804749.0, + "step": 958 + }, + { + "epoch": 7.051660516605166, + "grad_norm": 0.8330900045498715, + "learning_rate": 2.8488539460387822e-06, + "loss": 0.1561, + "num_tokens": 105918245.0, + "step": 959 + }, + { + "epoch": 7.059040590405904, + "grad_norm": 0.882272731175345, + "learning_rate": 2.8401349036209563e-06, + "loss": 0.1712, + "num_tokens": 106001812.0, + "step": 960 + }, + { + "epoch": 7.0664206642066425, + "grad_norm": 1.0611329952976811, + "learning_rate": 2.8314311819470786e-06, + "loss": 0.1288, + "num_tokens": 106096337.0, + "step": 961 + }, + { + "epoch": 7.07380073800738, + "grad_norm": 0.8757551848240493, + "learning_rate": 2.822742831150328e-06, + "loss": 0.1456, + "num_tokens": 106227709.0, + "step": 962 + }, + { + "epoch": 7.081180811808118, + "grad_norm": 0.9051974031387492, + "learning_rate": 2.814069901275345e-06, + "loss": 0.6104, + "num_tokens": 106371387.0, + "step": 963 + }, + { + "epoch": 7.088560885608856, + "grad_norm": 0.747810427910208, + "learning_rate": 2.8054124422779495e-06, + "loss": 0.1615, + "num_tokens": 106480982.0, + "step": 964 + }, + { + "epoch": 7.095940959409594, + "grad_norm": 0.7570107836205541, + "learning_rate": 2.7967705040248467e-06, + "loss": 0.1543, + "num_tokens": 106591191.0, + "step": 965 + }, + { + "epoch": 7.1033210332103325, + "grad_norm": 0.7597118970878524, + "learning_rate": 2.788144136293347e-06, + "loss": 0.133, + "num_tokens": 106683305.0, + "step": 966 + }, + { + "epoch": 7.11070110701107, + "grad_norm": 0.8422954888652188, + "learning_rate": 2.779533388771069e-06, + "loss": 0.1766, + "num_tokens": 106772494.0, + "step": 967 + }, + { + "epoch": 7.118081180811808, + "grad_norm": 0.7074748344221055, + "learning_rate": 2.7709383110556663e-06, + "loss": 0.1295, + "num_tokens": 106898534.0, + "step": 968 + }, + { + "epoch": 7.125461254612546, + "grad_norm": 0.5549166268641669, + "learning_rate": 2.7623589526545292e-06, + "loss": 0.1397, + "num_tokens": 107027631.0, + "step": 969 + }, + { + "epoch": 7.132841328413284, + "grad_norm": 0.696716651502236, + "learning_rate": 2.753795362984507e-06, + "loss": 0.1261, + "num_tokens": 107120375.0, + "step": 970 + }, + { + "epoch": 7.1402214022140225, + "grad_norm": 0.629510010033181, + "learning_rate": 2.745247591371623e-06, + "loss": 0.1166, + "num_tokens": 107222605.0, + "step": 971 + }, + { + "epoch": 7.14760147601476, + "grad_norm": 0.6636518088732871, + "learning_rate": 2.736715687050787e-06, + "loss": 0.1419, + "num_tokens": 107314539.0, + "step": 972 + }, + { + "epoch": 7.154981549815498, + "grad_norm": 0.7553635445094093, + "learning_rate": 2.7281996991655147e-06, + "loss": 0.1519, + "num_tokens": 107400637.0, + "step": 973 + }, + { + "epoch": 7.162361623616236, + "grad_norm": 0.6672124661795636, + "learning_rate": 2.719699676767641e-06, + "loss": 0.1628, + "num_tokens": 107507287.0, + "step": 974 + }, + { + "epoch": 7.169741697416974, + "grad_norm": 0.7514636968559429, + "learning_rate": 2.711215668817046e-06, + "loss": 0.1542, + "num_tokens": 107610980.0, + "step": 975 + }, + { + "epoch": 7.177121771217712, + "grad_norm": 0.7856635815707779, + "learning_rate": 2.7027477241813628e-06, + "loss": 0.1407, + "num_tokens": 107711542.0, + "step": 976 + }, + { + "epoch": 7.18450184501845, + "grad_norm": 0.6777094530338031, + "learning_rate": 2.6942958916356997e-06, + "loss": 0.1179, + "num_tokens": 107809906.0, + "step": 977 + }, + { + "epoch": 7.191881918819188, + "grad_norm": 0.6778942587881792, + "learning_rate": 2.685860219862362e-06, + "loss": 0.1601, + "num_tokens": 107940488.0, + "step": 978 + }, + { + "epoch": 7.199261992619927, + "grad_norm": 0.7379615357781537, + "learning_rate": 2.6774407574505677e-06, + "loss": 0.1212, + "num_tokens": 108041747.0, + "step": 979 + }, + { + "epoch": 7.206642066420664, + "grad_norm": 0.6808474625734778, + "learning_rate": 2.669037552896172e-06, + "loss": 0.1769, + "num_tokens": 108164572.0, + "step": 980 + }, + { + "epoch": 7.214022140221402, + "grad_norm": 0.7613234200216101, + "learning_rate": 2.6606506546013813e-06, + "loss": 0.1268, + "num_tokens": 108272656.0, + "step": 981 + }, + { + "epoch": 7.22140221402214, + "grad_norm": 0.6673755565328315, + "learning_rate": 2.65228011087448e-06, + "loss": 0.1348, + "num_tokens": 108363753.0, + "step": 982 + }, + { + "epoch": 7.228782287822878, + "grad_norm": 0.7883723389050911, + "learning_rate": 2.643925969929555e-06, + "loss": 0.1738, + "num_tokens": 108469274.0, + "step": 983 + }, + { + "epoch": 7.236162361623617, + "grad_norm": 0.7164678591014224, + "learning_rate": 2.635588279886207e-06, + "loss": 0.12, + "num_tokens": 108562632.0, + "step": 984 + }, + { + "epoch": 7.243542435424354, + "grad_norm": 0.7832327287230083, + "learning_rate": 2.6272670887692832e-06, + "loss": 0.1707, + "num_tokens": 108680167.0, + "step": 985 + }, + { + "epoch": 7.250922509225092, + "grad_norm": 0.5796419139653467, + "learning_rate": 2.618962444508599e-06, + "loss": 0.642, + "num_tokens": 108850176.0, + "step": 986 + }, + { + "epoch": 7.25830258302583, + "grad_norm": 0.6134482139402445, + "learning_rate": 2.6106743949386585e-06, + "loss": 0.1114, + "num_tokens": 108946874.0, + "step": 987 + }, + { + "epoch": 7.265682656826568, + "grad_norm": 0.630596461450639, + "learning_rate": 2.6024029877983804e-06, + "loss": 0.1575, + "num_tokens": 109090243.0, + "step": 988 + }, + { + "epoch": 7.273062730627307, + "grad_norm": 0.7023239819899377, + "learning_rate": 2.594148270730823e-06, + "loss": 0.143, + "num_tokens": 109218260.0, + "step": 989 + }, + { + "epoch": 7.280442804428044, + "grad_norm": 0.7603245714545737, + "learning_rate": 2.5859102912829127e-06, + "loss": 0.1351, + "num_tokens": 109328685.0, + "step": 990 + }, + { + "epoch": 7.287822878228782, + "grad_norm": 0.6309167253692112, + "learning_rate": 2.577689096905166e-06, + "loss": 0.1456, + "num_tokens": 109445171.0, + "step": 991 + }, + { + "epoch": 7.29520295202952, + "grad_norm": 0.6396034312241526, + "learning_rate": 2.5694847349514175e-06, + "loss": 0.1354, + "num_tokens": 109545079.0, + "step": 992 + }, + { + "epoch": 7.302583025830258, + "grad_norm": 0.7331489811512829, + "learning_rate": 2.56129725267855e-06, + "loss": 0.1549, + "num_tokens": 109679753.0, + "step": 993 + }, + { + "epoch": 7.3099630996309966, + "grad_norm": 0.7132913901541111, + "learning_rate": 2.5531266972462176e-06, + "loss": 0.2337, + "num_tokens": 109845544.0, + "step": 994 + }, + { + "epoch": 7.317343173431734, + "grad_norm": 0.672421454522611, + "learning_rate": 2.544973115716577e-06, + "loss": 0.1603, + "num_tokens": 109978598.0, + "step": 995 + }, + { + "epoch": 7.324723247232472, + "grad_norm": 0.7517681603828581, + "learning_rate": 2.5368365550540154e-06, + "loss": 0.135, + "num_tokens": 110068508.0, + "step": 996 + }, + { + "epoch": 7.332103321033211, + "grad_norm": 0.6560651437258584, + "learning_rate": 2.52871706212488e-06, + "loss": 0.1427, + "num_tokens": 110140082.0, + "step": 997 + }, + { + "epoch": 7.339483394833948, + "grad_norm": 0.7577820555330438, + "learning_rate": 2.5206146836972102e-06, + "loss": 0.1477, + "num_tokens": 110256850.0, + "step": 998 + }, + { + "epoch": 7.3468634686346865, + "grad_norm": 0.6606555518773797, + "learning_rate": 2.5125294664404635e-06, + "loss": 0.1314, + "num_tokens": 110356826.0, + "step": 999 + }, + { + "epoch": 7.354243542435424, + "grad_norm": 0.7813586653528403, + "learning_rate": 2.504461456925251e-06, + "loss": 0.1547, + "num_tokens": 110458059.0, + "step": 1000 + }, + { + "epoch": 7.361623616236162, + "grad_norm": 0.7056250932214523, + "learning_rate": 2.4964107016230703e-06, + "loss": 0.2277, + "num_tokens": 110588624.0, + "step": 1001 + }, + { + "epoch": 7.369003690036901, + "grad_norm": 0.7183263827162158, + "learning_rate": 2.488377246906031e-06, + "loss": 0.1053, + "num_tokens": 110675900.0, + "step": 1002 + }, + { + "epoch": 7.376383763837638, + "grad_norm": 0.6558940456536225, + "learning_rate": 2.4803611390465925e-06, + "loss": 0.1339, + "num_tokens": 110782752.0, + "step": 1003 + }, + { + "epoch": 7.3837638376383765, + "grad_norm": 0.6936986894485098, + "learning_rate": 2.4723624242173007e-06, + "loss": 0.1548, + "num_tokens": 110899453.0, + "step": 1004 + }, + { + "epoch": 7.391143911439114, + "grad_norm": 0.7001137198073798, + "learning_rate": 2.4643811484905145e-06, + "loss": 0.147, + "num_tokens": 111051369.0, + "step": 1005 + }, + { + "epoch": 7.398523985239852, + "grad_norm": 0.7131543193897704, + "learning_rate": 2.4564173578381447e-06, + "loss": 0.1229, + "num_tokens": 111147927.0, + "step": 1006 + }, + { + "epoch": 7.405904059040591, + "grad_norm": 0.7784078969716604, + "learning_rate": 2.4484710981313883e-06, + "loss": 0.1441, + "num_tokens": 111245874.0, + "step": 1007 + }, + { + "epoch": 7.413284132841328, + "grad_norm": 0.6840839356194213, + "learning_rate": 2.4405424151404664e-06, + "loss": 0.4253, + "num_tokens": 111385282.0, + "step": 1008 + }, + { + "epoch": 7.4206642066420665, + "grad_norm": 0.8005602878365277, + "learning_rate": 2.432631354534355e-06, + "loss": 0.1455, + "num_tokens": 111508532.0, + "step": 1009 + }, + { + "epoch": 7.428044280442805, + "grad_norm": 0.7005436952515709, + "learning_rate": 2.424737961880531e-06, + "loss": 0.1318, + "num_tokens": 111603739.0, + "step": 1010 + }, + { + "epoch": 7.435424354243542, + "grad_norm": 0.656941062893856, + "learning_rate": 2.4168622826447016e-06, + "loss": 0.1288, + "num_tokens": 111719579.0, + "step": 1011 + }, + { + "epoch": 7.442804428044281, + "grad_norm": 0.7944245640607924, + "learning_rate": 2.4090043621905435e-06, + "loss": 0.1663, + "num_tokens": 111825869.0, + "step": 1012 + }, + { + "epoch": 7.450184501845018, + "grad_norm": 0.880039654114411, + "learning_rate": 2.401164245779447e-06, + "loss": 0.1286, + "num_tokens": 111920941.0, + "step": 1013 + }, + { + "epoch": 7.4575645756457565, + "grad_norm": 0.6486686170838069, + "learning_rate": 2.3933419785702476e-06, + "loss": 0.146, + "num_tokens": 112044133.0, + "step": 1014 + }, + { + "epoch": 7.464944649446495, + "grad_norm": 0.7008456334929012, + "learning_rate": 2.385537605618974e-06, + "loss": 0.1562, + "num_tokens": 112147601.0, + "step": 1015 + }, + { + "epoch": 7.472324723247232, + "grad_norm": 0.8614114433536172, + "learning_rate": 2.377751171878581e-06, + "loss": 0.1754, + "num_tokens": 112286299.0, + "step": 1016 + }, + { + "epoch": 7.479704797047971, + "grad_norm": 0.6006994620540834, + "learning_rate": 2.369982722198697e-06, + "loss": 0.135, + "num_tokens": 112397315.0, + "step": 1017 + }, + { + "epoch": 7.487084870848708, + "grad_norm": 0.6421371758364183, + "learning_rate": 2.3622323013253595e-06, + "loss": 0.1827, + "num_tokens": 112551274.0, + "step": 1018 + }, + { + "epoch": 7.4944649446494465, + "grad_norm": 0.6116215442679512, + "learning_rate": 2.354499953900765e-06, + "loss": 0.1276, + "num_tokens": 112692453.0, + "step": 1019 + }, + { + "epoch": 7.501845018450185, + "grad_norm": 0.6575972998841045, + "learning_rate": 2.346785724463002e-06, + "loss": 0.1272, + "num_tokens": 112781826.0, + "step": 1020 + }, + { + "epoch": 7.509225092250922, + "grad_norm": 0.7461270099639635, + "learning_rate": 2.339089657445807e-06, + "loss": 0.1481, + "num_tokens": 112886040.0, + "step": 1021 + }, + { + "epoch": 7.516605166051661, + "grad_norm": 0.6313586444022081, + "learning_rate": 2.3314117971782947e-06, + "loss": 0.1456, + "num_tokens": 113017230.0, + "step": 1022 + }, + { + "epoch": 7.523985239852399, + "grad_norm": 0.7039813774470369, + "learning_rate": 2.3237521878847128e-06, + "loss": 0.1586, + "num_tokens": 113126819.0, + "step": 1023 + }, + { + "epoch": 7.531365313653136, + "grad_norm": 0.7964284587475011, + "learning_rate": 2.316110873684183e-06, + "loss": 0.1183, + "num_tokens": 113225040.0, + "step": 1024 + }, + { + "epoch": 7.538745387453875, + "grad_norm": 0.7294701258254773, + "learning_rate": 2.308487898590448e-06, + "loss": 0.1444, + "num_tokens": 113327340.0, + "step": 1025 + }, + { + "epoch": 7.546125461254612, + "grad_norm": 0.877832148971536, + "learning_rate": 2.3008833065116173e-06, + "loss": 0.1451, + "num_tokens": 113406386.0, + "step": 1026 + }, + { + "epoch": 7.553505535055351, + "grad_norm": 0.8266989648256501, + "learning_rate": 2.2932971412499173e-06, + "loss": 0.1622, + "num_tokens": 113511600.0, + "step": 1027 + }, + { + "epoch": 7.560885608856088, + "grad_norm": 0.7095373348730436, + "learning_rate": 2.285729446501434e-06, + "loss": 0.1556, + "num_tokens": 113654802.0, + "step": 1028 + }, + { + "epoch": 7.568265682656826, + "grad_norm": 0.577958730374615, + "learning_rate": 2.2781802658558636e-06, + "loss": 0.117, + "num_tokens": 113770358.0, + "step": 1029 + }, + { + "epoch": 7.575645756457565, + "grad_norm": 0.8026599393917855, + "learning_rate": 2.2706496427962633e-06, + "loss": 0.1556, + "num_tokens": 113842420.0, + "step": 1030 + }, + { + "epoch": 7.583025830258302, + "grad_norm": 0.8670380842470107, + "learning_rate": 2.263137620698797e-06, + "loss": 0.1503, + "num_tokens": 113942402.0, + "step": 1031 + }, + { + "epoch": 7.590405904059041, + "grad_norm": 0.6459757734058699, + "learning_rate": 2.2556442428324896e-06, + "loss": 0.112, + "num_tokens": 114049763.0, + "step": 1032 + }, + { + "epoch": 7.597785977859779, + "grad_norm": 0.6198832800578641, + "learning_rate": 2.2481695523589747e-06, + "loss": 0.1508, + "num_tokens": 114193985.0, + "step": 1033 + }, + { + "epoch": 7.605166051660516, + "grad_norm": 0.6464652228137838, + "learning_rate": 2.240713592332248e-06, + "loss": 0.1414, + "num_tokens": 114287318.0, + "step": 1034 + }, + { + "epoch": 7.612546125461255, + "grad_norm": 0.6911239323885385, + "learning_rate": 2.2332764056984156e-06, + "loss": 0.3724, + "num_tokens": 114383351.0, + "step": 1035 + }, + { + "epoch": 7.619926199261993, + "grad_norm": 0.7110968352750003, + "learning_rate": 2.2258580352954558e-06, + "loss": 0.1471, + "num_tokens": 114482588.0, + "step": 1036 + }, + { + "epoch": 7.627306273062731, + "grad_norm": 0.6758906829132127, + "learning_rate": 2.2184585238529584e-06, + "loss": 0.1284, + "num_tokens": 114581882.0, + "step": 1037 + }, + { + "epoch": 7.634686346863469, + "grad_norm": 0.6890951539697263, + "learning_rate": 2.2110779139918893e-06, + "loss": 0.172, + "num_tokens": 114687278.0, + "step": 1038 + }, + { + "epoch": 7.642066420664206, + "grad_norm": 0.7597164898491231, + "learning_rate": 2.2037162482243445e-06, + "loss": 0.1208, + "num_tokens": 114786568.0, + "step": 1039 + }, + { + "epoch": 7.649446494464945, + "grad_norm": 0.7074984062973345, + "learning_rate": 2.1963735689532993e-06, + "loss": 0.1299, + "num_tokens": 114870784.0, + "step": 1040 + }, + { + "epoch": 7.656826568265682, + "grad_norm": 0.692609386390428, + "learning_rate": 2.189049918472368e-06, + "loss": 0.182, + "num_tokens": 115000403.0, + "step": 1041 + }, + { + "epoch": 7.6642066420664205, + "grad_norm": 0.825979291195909, + "learning_rate": 2.1817453389655597e-06, + "loss": 0.1265, + "num_tokens": 115085653.0, + "step": 1042 + }, + { + "epoch": 7.671586715867159, + "grad_norm": 0.591110702939691, + "learning_rate": 2.174459872507035e-06, + "loss": 0.1313, + "num_tokens": 115199605.0, + "step": 1043 + }, + { + "epoch": 7.678966789667896, + "grad_norm": 0.6922804180876169, + "learning_rate": 2.167193561060863e-06, + "loss": 0.1422, + "num_tokens": 115316924.0, + "step": 1044 + }, + { + "epoch": 7.686346863468635, + "grad_norm": 0.8015568869876212, + "learning_rate": 2.1599464464807856e-06, + "loss": 0.1639, + "num_tokens": 115424183.0, + "step": 1045 + }, + { + "epoch": 7.693726937269373, + "grad_norm": 0.795673213014524, + "learning_rate": 2.1527185705099646e-06, + "loss": 0.1465, + "num_tokens": 115526922.0, + "step": 1046 + }, + { + "epoch": 7.7011070110701105, + "grad_norm": 0.6167523878689091, + "learning_rate": 2.145509974780752e-06, + "loss": 0.1195, + "num_tokens": 115618628.0, + "step": 1047 + }, + { + "epoch": 7.708487084870849, + "grad_norm": 0.6940445233539998, + "learning_rate": 2.1383207008144447e-06, + "loss": 0.1225, + "num_tokens": 115709416.0, + "step": 1048 + }, + { + "epoch": 7.715867158671586, + "grad_norm": 0.8780760736913564, + "learning_rate": 2.131150790021047e-06, + "loss": 0.2002, + "num_tokens": 115808392.0, + "step": 1049 + }, + { + "epoch": 7.723247232472325, + "grad_norm": 0.6517738449681595, + "learning_rate": 2.124000283699033e-06, + "loss": 0.1405, + "num_tokens": 115925056.0, + "step": 1050 + }, + { + "epoch": 7.730627306273063, + "grad_norm": 0.7288093305351533, + "learning_rate": 2.1168692230351056e-06, + "loss": 0.1343, + "num_tokens": 116026918.0, + "step": 1051 + }, + { + "epoch": 7.7380073800738005, + "grad_norm": 0.7310571676229276, + "learning_rate": 2.1097576491039616e-06, + "loss": 0.159, + "num_tokens": 116140965.0, + "step": 1052 + }, + { + "epoch": 7.745387453874539, + "grad_norm": 0.7922586445064035, + "learning_rate": 2.1026656028680577e-06, + "loss": 0.187, + "num_tokens": 116259358.0, + "step": 1053 + }, + { + "epoch": 7.752767527675276, + "grad_norm": 0.8076731960056938, + "learning_rate": 2.0955931251773694e-06, + "loss": 0.1511, + "num_tokens": 116366422.0, + "step": 1054 + }, + { + "epoch": 7.760147601476015, + "grad_norm": 0.7714587035390015, + "learning_rate": 2.088540256769157e-06, + "loss": 0.1583, + "num_tokens": 116469692.0, + "step": 1055 + }, + { + "epoch": 7.767527675276753, + "grad_norm": 0.8724198754759499, + "learning_rate": 2.0815070382677325e-06, + "loss": 0.1346, + "num_tokens": 116558086.0, + "step": 1056 + }, + { + "epoch": 7.7749077490774905, + "grad_norm": 0.773017521357972, + "learning_rate": 2.0744935101842277e-06, + "loss": 0.1553, + "num_tokens": 116675823.0, + "step": 1057 + }, + { + "epoch": 7.782287822878229, + "grad_norm": 0.7258629619643803, + "learning_rate": 2.067499712916355e-06, + "loss": 0.1096, + "num_tokens": 116763713.0, + "step": 1058 + }, + { + "epoch": 7.789667896678967, + "grad_norm": 0.7194357884958432, + "learning_rate": 2.060525686748179e-06, + "loss": 0.1348, + "num_tokens": 116851558.0, + "step": 1059 + }, + { + "epoch": 7.797047970479705, + "grad_norm": 0.6692815808770822, + "learning_rate": 2.0535714718498824e-06, + "loss": 0.1496, + "num_tokens": 116968472.0, + "step": 1060 + }, + { + "epoch": 7.804428044280443, + "grad_norm": 0.7919788891125946, + "learning_rate": 2.0466371082775362e-06, + "loss": 0.1468, + "num_tokens": 117069879.0, + "step": 1061 + }, + { + "epoch": 7.8118081180811805, + "grad_norm": 0.7147707246771277, + "learning_rate": 2.0397226359728705e-06, + "loss": 0.1506, + "num_tokens": 117210319.0, + "step": 1062 + }, + { + "epoch": 7.819188191881919, + "grad_norm": 0.7144100080912492, + "learning_rate": 2.03282809476304e-06, + "loss": 0.1458, + "num_tokens": 117312938.0, + "step": 1063 + }, + { + "epoch": 7.826568265682657, + "grad_norm": 0.8295047610651002, + "learning_rate": 2.025953524360396e-06, + "loss": 0.1635, + "num_tokens": 117439801.0, + "step": 1064 + }, + { + "epoch": 7.833948339483395, + "grad_norm": 0.7163457526537156, + "learning_rate": 2.0190989643622615e-06, + "loss": 0.125, + "num_tokens": 117522839.0, + "step": 1065 + }, + { + "epoch": 7.841328413284133, + "grad_norm": 0.7292120907586226, + "learning_rate": 2.012264454250697e-06, + "loss": 0.1251, + "num_tokens": 117616016.0, + "step": 1066 + }, + { + "epoch": 7.8487084870848705, + "grad_norm": 0.6896242843049234, + "learning_rate": 2.0054500333922783e-06, + "loss": 0.1635, + "num_tokens": 117723281.0, + "step": 1067 + }, + { + "epoch": 7.856088560885609, + "grad_norm": 0.760110462390299, + "learning_rate": 1.998655741037867e-06, + "loss": 0.1686, + "num_tokens": 117868155.0, + "step": 1068 + }, + { + "epoch": 7.863468634686347, + "grad_norm": 0.6419790312136794, + "learning_rate": 1.9918816163223847e-06, + "loss": 0.1311, + "num_tokens": 117969002.0, + "step": 1069 + }, + { + "epoch": 7.870848708487085, + "grad_norm": 0.7127690713983089, + "learning_rate": 1.985127698264589e-06, + "loss": 0.1232, + "num_tokens": 118080125.0, + "step": 1070 + }, + { + "epoch": 7.878228782287823, + "grad_norm": 0.6059823842723294, + "learning_rate": 1.9783940257668475e-06, + "loss": 0.1805, + "num_tokens": 118205028.0, + "step": 1071 + }, + { + "epoch": 7.885608856088561, + "grad_norm": 0.7724954346608415, + "learning_rate": 1.971680637614915e-06, + "loss": 0.2085, + "num_tokens": 118327856.0, + "step": 1072 + }, + { + "epoch": 7.892988929889299, + "grad_norm": 0.6360178773299939, + "learning_rate": 1.964987572477706e-06, + "loss": 0.1432, + "num_tokens": 118434556.0, + "step": 1073 + }, + { + "epoch": 7.900369003690037, + "grad_norm": 0.8040243280272408, + "learning_rate": 1.9583148689070762e-06, + "loss": 0.1277, + "num_tokens": 118536005.0, + "step": 1074 + }, + { + "epoch": 7.907749077490775, + "grad_norm": 0.638107685289796, + "learning_rate": 1.9516625653376027e-06, + "loss": 0.1282, + "num_tokens": 118639759.0, + "step": 1075 + }, + { + "epoch": 7.915129151291513, + "grad_norm": 0.6035748385223451, + "learning_rate": 1.9450307000863546e-06, + "loss": 0.1669, + "num_tokens": 118820768.0, + "step": 1076 + }, + { + "epoch": 7.922509225092251, + "grad_norm": 0.6903668767059657, + "learning_rate": 1.9384193113526793e-06, + "loss": 0.1884, + "num_tokens": 118936693.0, + "step": 1077 + }, + { + "epoch": 7.929889298892989, + "grad_norm": 0.7353679463674523, + "learning_rate": 1.9318284372179784e-06, + "loss": 0.1454, + "num_tokens": 119049510.0, + "step": 1078 + }, + { + "epoch": 7.937269372693727, + "grad_norm": 0.8100224185456285, + "learning_rate": 1.925258115645493e-06, + "loss": 0.166, + "num_tokens": 119151079.0, + "step": 1079 + }, + { + "epoch": 7.944649446494465, + "grad_norm": 0.682480326618363, + "learning_rate": 1.9187083844800795e-06, + "loss": 0.161, + "num_tokens": 119275485.0, + "step": 1080 + }, + { + "epoch": 7.952029520295203, + "grad_norm": 0.7756070561036585, + "learning_rate": 1.9121792814479947e-06, + "loss": 0.1324, + "num_tokens": 119384383.0, + "step": 1081 + }, + { + "epoch": 7.959409594095941, + "grad_norm": 0.6820620129417011, + "learning_rate": 1.9056708441566784e-06, + "loss": 0.1412, + "num_tokens": 119486138.0, + "step": 1082 + }, + { + "epoch": 7.966789667896679, + "grad_norm": 0.6962863439355318, + "learning_rate": 1.8991831100945351e-06, + "loss": 0.1676, + "num_tokens": 119570636.0, + "step": 1083 + }, + { + "epoch": 7.974169741697417, + "grad_norm": 1.083753376552152, + "learning_rate": 1.8927161166307212e-06, + "loss": 0.1658, + "num_tokens": 119669594.0, + "step": 1084 + }, + { + "epoch": 7.9815498154981555, + "grad_norm": 0.6265816477912981, + "learning_rate": 1.8862699010149269e-06, + "loss": 0.1916, + "num_tokens": 119771467.0, + "step": 1085 + }, + { + "epoch": 7.988929889298893, + "grad_norm": 0.8302480419180455, + "learning_rate": 1.8798445003771622e-06, + "loss": 0.4692, + "num_tokens": 119926781.0, + "step": 1086 + }, + { + "epoch": 7.996309963099631, + "grad_norm": 0.6010145896400855, + "learning_rate": 1.8734399517275434e-06, + "loss": 0.1701, + "num_tokens": 120069416.0, + "step": 1087 + }, + { + "epoch": 8.0, + "grad_norm": 1.124987574983492, + "learning_rate": 1.867056291956082e-06, + "loss": 0.182, + "num_tokens": 120150035.0, + "step": 1088 + }, + { + "epoch": 8.007380073800737, + "grad_norm": 0.607427419879896, + "learning_rate": 1.8606935578324687e-06, + "loss": 0.1024, + "num_tokens": 120248370.0, + "step": 1089 + }, + { + "epoch": 8.014760147601477, + "grad_norm": 0.6535965619098806, + "learning_rate": 1.8543517860058619e-06, + "loss": 0.1196, + "num_tokens": 120340917.0, + "step": 1090 + }, + { + "epoch": 8.022140221402214, + "grad_norm": 0.6294450411059588, + "learning_rate": 1.848031013004678e-06, + "loss": 0.1035, + "num_tokens": 120426701.0, + "step": 1091 + }, + { + "epoch": 8.029520295202952, + "grad_norm": 0.5688460646507363, + "learning_rate": 1.8417312752363844e-06, + "loss": 0.138, + "num_tokens": 120588615.0, + "step": 1092 + }, + { + "epoch": 8.03690036900369, + "grad_norm": 0.5995824249286849, + "learning_rate": 1.8354526089872826e-06, + "loss": 0.1419, + "num_tokens": 120698924.0, + "step": 1093 + }, + { + "epoch": 8.044280442804428, + "grad_norm": 0.6075907318827694, + "learning_rate": 1.8291950504223033e-06, + "loss": 0.1039, + "num_tokens": 120811933.0, + "step": 1094 + }, + { + "epoch": 8.051660516605166, + "grad_norm": 0.6933032668859173, + "learning_rate": 1.8229586355847978e-06, + "loss": 0.1124, + "num_tokens": 120922751.0, + "step": 1095 + }, + { + "epoch": 8.059040590405903, + "grad_norm": 0.6970315340175753, + "learning_rate": 1.816743400396329e-06, + "loss": 0.0972, + "num_tokens": 121034194.0, + "step": 1096 + }, + { + "epoch": 8.066420664206642, + "grad_norm": 0.7617170293501341, + "learning_rate": 1.81054938065647e-06, + "loss": 0.1269, + "num_tokens": 121140277.0, + "step": 1097 + }, + { + "epoch": 8.07380073800738, + "grad_norm": 0.7173193240486126, + "learning_rate": 1.804376612042589e-06, + "loss": 0.0877, + "num_tokens": 121213531.0, + "step": 1098 + }, + { + "epoch": 8.081180811808117, + "grad_norm": 0.7090653918012082, + "learning_rate": 1.7982251301096498e-06, + "loss": 0.098, + "num_tokens": 121336819.0, + "step": 1099 + }, + { + "epoch": 8.088560885608857, + "grad_norm": 0.5605275633072052, + "learning_rate": 1.7920949702900058e-06, + "loss": 0.0748, + "num_tokens": 121460540.0, + "step": 1100 + }, + { + "epoch": 8.095940959409594, + "grad_norm": 0.9019987403479388, + "learning_rate": 1.785986167893195e-06, + "loss": 0.116, + "num_tokens": 121546617.0, + "step": 1101 + }, + { + "epoch": 8.103321033210332, + "grad_norm": 0.8064867967283718, + "learning_rate": 1.7798987581057386e-06, + "loss": 0.1114, + "num_tokens": 121628635.0, + "step": 1102 + }, + { + "epoch": 8.11070110701107, + "grad_norm": 0.5272843009986005, + "learning_rate": 1.7738327759909354e-06, + "loss": 0.0954, + "num_tokens": 121757053.0, + "step": 1103 + }, + { + "epoch": 8.118081180811808, + "grad_norm": 0.6908802325389886, + "learning_rate": 1.7677882564886618e-06, + "loss": 0.1085, + "num_tokens": 121851076.0, + "step": 1104 + }, + { + "epoch": 8.125461254612546, + "grad_norm": 0.6822375336604328, + "learning_rate": 1.761765234415172e-06, + "loss": 0.1166, + "num_tokens": 121955294.0, + "step": 1105 + }, + { + "epoch": 8.132841328413285, + "grad_norm": 0.48922879377109746, + "learning_rate": 1.7557637444628935e-06, + "loss": 0.0955, + "num_tokens": 122086501.0, + "step": 1106 + }, + { + "epoch": 8.140221402214022, + "grad_norm": 0.6797930896552248, + "learning_rate": 1.74978382120023e-06, + "loss": 0.1186, + "num_tokens": 122163798.0, + "step": 1107 + }, + { + "epoch": 8.14760147601476, + "grad_norm": 0.6264128859132683, + "learning_rate": 1.743825499071362e-06, + "loss": 0.1076, + "num_tokens": 122277678.0, + "step": 1108 + }, + { + "epoch": 8.154981549815497, + "grad_norm": 0.7398057787194479, + "learning_rate": 1.7378888123960474e-06, + "loss": 0.1053, + "num_tokens": 122365248.0, + "step": 1109 + }, + { + "epoch": 8.162361623616237, + "grad_norm": 0.6466976054306609, + "learning_rate": 1.7319737953694267e-06, + "loss": 0.1067, + "num_tokens": 122451453.0, + "step": 1110 + }, + { + "epoch": 8.169741697416974, + "grad_norm": 0.5520221171044929, + "learning_rate": 1.7260804820618207e-06, + "loss": 0.1034, + "num_tokens": 122543362.0, + "step": 1111 + }, + { + "epoch": 8.177121771217712, + "grad_norm": 0.7733568794396356, + "learning_rate": 1.72020890641854e-06, + "loss": 0.1072, + "num_tokens": 122647562.0, + "step": 1112 + }, + { + "epoch": 8.18450184501845, + "grad_norm": 0.5953487947405357, + "learning_rate": 1.7143591022596846e-06, + "loss": 0.1116, + "num_tokens": 122771485.0, + "step": 1113 + }, + { + "epoch": 8.191881918819188, + "grad_norm": 0.5452828548484447, + "learning_rate": 1.708531103279954e-06, + "loss": 0.0781, + "num_tokens": 122880126.0, + "step": 1114 + }, + { + "epoch": 8.199261992619926, + "grad_norm": 0.7244406131025011, + "learning_rate": 1.7027249430484496e-06, + "loss": 0.1455, + "num_tokens": 123011714.0, + "step": 1115 + }, + { + "epoch": 8.206642066420665, + "grad_norm": 0.6286165801135194, + "learning_rate": 1.6969406550084805e-06, + "loss": 0.1054, + "num_tokens": 123119734.0, + "step": 1116 + }, + { + "epoch": 8.214022140221402, + "grad_norm": 0.702033524190885, + "learning_rate": 1.691178272477375e-06, + "loss": 0.419, + "num_tokens": 123240721.0, + "step": 1117 + }, + { + "epoch": 8.22140221402214, + "grad_norm": 0.7033012508596181, + "learning_rate": 1.6854378286462844e-06, + "loss": 0.1303, + "num_tokens": 123377125.0, + "step": 1118 + }, + { + "epoch": 8.228782287822877, + "grad_norm": 0.6982785557123685, + "learning_rate": 1.6797193565799955e-06, + "loss": 0.1361, + "num_tokens": 123493175.0, + "step": 1119 + }, + { + "epoch": 8.236162361623617, + "grad_norm": 0.5384384152900734, + "learning_rate": 1.674022889216737e-06, + "loss": 0.4249, + "num_tokens": 123635160.0, + "step": 1120 + }, + { + "epoch": 8.243542435424354, + "grad_norm": 0.5900790975693391, + "learning_rate": 1.668348459367992e-06, + "loss": 0.0914, + "num_tokens": 123741979.0, + "step": 1121 + }, + { + "epoch": 8.250922509225092, + "grad_norm": 0.6652651245218247, + "learning_rate": 1.6626960997183074e-06, + "loss": 0.1206, + "num_tokens": 123852328.0, + "step": 1122 + }, + { + "epoch": 8.25830258302583, + "grad_norm": 0.5755410301347661, + "learning_rate": 1.6570658428251075e-06, + "loss": 0.1731, + "num_tokens": 124007659.0, + "step": 1123 + }, + { + "epoch": 8.265682656826568, + "grad_norm": 0.6436584539623239, + "learning_rate": 1.6514577211185046e-06, + "loss": 0.1111, + "num_tokens": 124089404.0, + "step": 1124 + }, + { + "epoch": 8.273062730627306, + "grad_norm": 0.6593239587931535, + "learning_rate": 1.6458717669011127e-06, + "loss": 0.1091, + "num_tokens": 124194927.0, + "step": 1125 + }, + { + "epoch": 8.280442804428045, + "grad_norm": 0.6676510191448691, + "learning_rate": 1.6403080123478631e-06, + "loss": 0.1269, + "num_tokens": 124305690.0, + "step": 1126 + }, + { + "epoch": 8.287822878228782, + "grad_norm": 0.6024940770564627, + "learning_rate": 1.6347664895058151e-06, + "loss": 0.1102, + "num_tokens": 124423653.0, + "step": 1127 + }, + { + "epoch": 8.29520295202952, + "grad_norm": 0.5704030544841321, + "learning_rate": 1.6292472302939776e-06, + "loss": 0.1236, + "num_tokens": 124535771.0, + "step": 1128 + }, + { + "epoch": 8.302583025830259, + "grad_norm": 0.6050519524784308, + "learning_rate": 1.6237502665031188e-06, + "loss": 0.1082, + "num_tokens": 124635652.0, + "step": 1129 + }, + { + "epoch": 8.309963099630997, + "grad_norm": 0.7476529098493471, + "learning_rate": 1.6182756297955865e-06, + "loss": 0.1363, + "num_tokens": 124768210.0, + "step": 1130 + }, + { + "epoch": 8.317343173431734, + "grad_norm": 0.6800745728650065, + "learning_rate": 1.6128233517051267e-06, + "loss": 0.098, + "num_tokens": 124873080.0, + "step": 1131 + }, + { + "epoch": 8.324723247232471, + "grad_norm": 0.6426860301671826, + "learning_rate": 1.6073934636366983e-06, + "loss": 0.1192, + "num_tokens": 124997839.0, + "step": 1132 + }, + { + "epoch": 8.33210332103321, + "grad_norm": 0.7184882712225652, + "learning_rate": 1.6019859968662956e-06, + "loss": 0.0982, + "num_tokens": 125083235.0, + "step": 1133 + }, + { + "epoch": 8.339483394833948, + "grad_norm": 0.5603381730436606, + "learning_rate": 1.5966009825407666e-06, + "loss": 0.0978, + "num_tokens": 125204535.0, + "step": 1134 + }, + { + "epoch": 8.346863468634686, + "grad_norm": 0.6530561101690034, + "learning_rate": 1.591238451677634e-06, + "loss": 0.1098, + "num_tokens": 125342218.0, + "step": 1135 + }, + { + "epoch": 8.354243542435425, + "grad_norm": 0.7113364303593663, + "learning_rate": 1.5858984351649157e-06, + "loss": 0.1487, + "num_tokens": 125455802.0, + "step": 1136 + }, + { + "epoch": 8.361623616236162, + "grad_norm": 0.6636500868696222, + "learning_rate": 1.5805809637609482e-06, + "loss": 0.1102, + "num_tokens": 125601963.0, + "step": 1137 + }, + { + "epoch": 8.3690036900369, + "grad_norm": 0.6106766856622697, + "learning_rate": 1.5752860680942094e-06, + "loss": 0.0908, + "num_tokens": 125685704.0, + "step": 1138 + }, + { + "epoch": 8.376383763837639, + "grad_norm": 0.6689267162919238, + "learning_rate": 1.5700137786631404e-06, + "loss": 0.0895, + "num_tokens": 125791716.0, + "step": 1139 + }, + { + "epoch": 8.383763837638377, + "grad_norm": 0.6209676350541423, + "learning_rate": 1.5647641258359724e-06, + "loss": 0.144, + "num_tokens": 125897551.0, + "step": 1140 + }, + { + "epoch": 8.391143911439114, + "grad_norm": 0.9232576930715772, + "learning_rate": 1.5595371398505498e-06, + "loss": 0.1248, + "num_tokens": 126006191.0, + "step": 1141 + }, + { + "epoch": 8.398523985239853, + "grad_norm": 0.5957913492192587, + "learning_rate": 1.5543328508141565e-06, + "loss": 0.1278, + "num_tokens": 126140879.0, + "step": 1142 + }, + { + "epoch": 8.40590405904059, + "grad_norm": 0.7026002448751827, + "learning_rate": 1.5491512887033427e-06, + "loss": 0.5765, + "num_tokens": 126251498.0, + "step": 1143 + }, + { + "epoch": 8.413284132841328, + "grad_norm": 0.6156038903064762, + "learning_rate": 1.5439924833637514e-06, + "loss": 0.0948, + "num_tokens": 126351415.0, + "step": 1144 + }, + { + "epoch": 8.420664206642066, + "grad_norm": 0.6115076221141882, + "learning_rate": 1.5388564645099486e-06, + "loss": 0.1236, + "num_tokens": 126493084.0, + "step": 1145 + }, + { + "epoch": 8.428044280442805, + "grad_norm": 0.519714845928742, + "learning_rate": 1.533743261725251e-06, + "loss": 0.1313, + "num_tokens": 126605851.0, + "step": 1146 + }, + { + "epoch": 8.435424354243542, + "grad_norm": 0.6145661780904482, + "learning_rate": 1.528652904461555e-06, + "loss": 0.0969, + "num_tokens": 126703227.0, + "step": 1147 + }, + { + "epoch": 8.44280442804428, + "grad_norm": 0.62380039781695, + "learning_rate": 1.5235854220391653e-06, + "loss": 0.1305, + "num_tokens": 126837789.0, + "step": 1148 + }, + { + "epoch": 8.450184501845019, + "grad_norm": 0.6278604652828446, + "learning_rate": 1.518540843646632e-06, + "loss": 0.0936, + "num_tokens": 126977414.0, + "step": 1149 + }, + { + "epoch": 8.457564575645756, + "grad_norm": 0.6673911003891321, + "learning_rate": 1.5135191983405767e-06, + "loss": 0.1149, + "num_tokens": 127094861.0, + "step": 1150 + }, + { + "epoch": 8.464944649446494, + "grad_norm": 0.5899818625924016, + "learning_rate": 1.5085205150455266e-06, + "loss": 0.1013, + "num_tokens": 127222864.0, + "step": 1151 + }, + { + "epoch": 8.472324723247233, + "grad_norm": 0.5779369770190897, + "learning_rate": 1.5035448225537493e-06, + "loss": 0.097, + "num_tokens": 127336404.0, + "step": 1152 + }, + { + "epoch": 8.47970479704797, + "grad_norm": 0.7033787316439764, + "learning_rate": 1.4985921495250852e-06, + "loss": 0.1124, + "num_tokens": 127444110.0, + "step": 1153 + }, + { + "epoch": 8.487084870848708, + "grad_norm": 0.7973835737567503, + "learning_rate": 1.4936625244867845e-06, + "loss": 0.1048, + "num_tokens": 127540650.0, + "step": 1154 + }, + { + "epoch": 8.494464944649447, + "grad_norm": 0.6013487090031089, + "learning_rate": 1.4887559758333408e-06, + "loss": 0.1008, + "num_tokens": 127655205.0, + "step": 1155 + }, + { + "epoch": 8.501845018450185, + "grad_norm": 0.5842927860924219, + "learning_rate": 1.4838725318263273e-06, + "loss": 0.1021, + "num_tokens": 127747864.0, + "step": 1156 + }, + { + "epoch": 8.509225092250922, + "grad_norm": 0.6318394517015664, + "learning_rate": 1.4790122205942387e-06, + "loss": 0.1166, + "num_tokens": 127864117.0, + "step": 1157 + }, + { + "epoch": 8.51660516605166, + "grad_norm": 0.6459305199339671, + "learning_rate": 1.474175070132322e-06, + "loss": 0.1149, + "num_tokens": 127971408.0, + "step": 1158 + }, + { + "epoch": 8.523985239852399, + "grad_norm": 0.5531547408810842, + "learning_rate": 1.4693611083024209e-06, + "loss": 0.2405, + "num_tokens": 128069774.0, + "step": 1159 + }, + { + "epoch": 8.531365313653136, + "grad_norm": 0.6167450947611348, + "learning_rate": 1.464570362832812e-06, + "loss": 0.0934, + "num_tokens": 128180206.0, + "step": 1160 + }, + { + "epoch": 8.538745387453874, + "grad_norm": 0.6232744033713421, + "learning_rate": 1.4598028613180468e-06, + "loss": 0.0963, + "num_tokens": 128299598.0, + "step": 1161 + }, + { + "epoch": 8.546125461254613, + "grad_norm": 0.744957544001579, + "learning_rate": 1.455058631218792e-06, + "loss": 0.0975, + "num_tokens": 128391605.0, + "step": 1162 + }, + { + "epoch": 8.55350553505535, + "grad_norm": 0.7185687853758047, + "learning_rate": 1.450337699861673e-06, + "loss": 0.1018, + "num_tokens": 128494086.0, + "step": 1163 + }, + { + "epoch": 8.560885608856088, + "grad_norm": 0.6598711587628634, + "learning_rate": 1.4456400944391147e-06, + "loss": 0.1067, + "num_tokens": 128627405.0, + "step": 1164 + }, + { + "epoch": 8.568265682656827, + "grad_norm": 0.7096790520934679, + "learning_rate": 1.440965842009182e-06, + "loss": 0.0955, + "num_tokens": 128709021.0, + "step": 1165 + }, + { + "epoch": 8.575645756457565, + "grad_norm": 0.7331549322698515, + "learning_rate": 1.4363149694954335e-06, + "loss": 0.4428, + "num_tokens": 128879133.0, + "step": 1166 + }, + { + "epoch": 8.583025830258302, + "grad_norm": 0.5965199139226404, + "learning_rate": 1.4316875036867555e-06, + "loss": 0.0988, + "num_tokens": 129002145.0, + "step": 1167 + }, + { + "epoch": 8.59040590405904, + "grad_norm": 0.6655629449526129, + "learning_rate": 1.427083471237213e-06, + "loss": 0.1119, + "num_tokens": 129097000.0, + "step": 1168 + }, + { + "epoch": 8.597785977859779, + "grad_norm": 0.7752488403562106, + "learning_rate": 1.4225028986658967e-06, + "loss": 0.1134, + "num_tokens": 129211370.0, + "step": 1169 + }, + { + "epoch": 8.605166051660516, + "grad_norm": 0.6568649340461798, + "learning_rate": 1.4179458123567677e-06, + "loss": 0.0988, + "num_tokens": 129314954.0, + "step": 1170 + }, + { + "epoch": 8.612546125461254, + "grad_norm": 0.636569631920654, + "learning_rate": 1.4134122385585092e-06, + "loss": 0.1287, + "num_tokens": 129433148.0, + "step": 1171 + }, + { + "epoch": 8.619926199261993, + "grad_norm": 0.5776985376253863, + "learning_rate": 1.4089022033843704e-06, + "loss": 0.1003, + "num_tokens": 129527036.0, + "step": 1172 + }, + { + "epoch": 8.62730627306273, + "grad_norm": 0.6437073737800211, + "learning_rate": 1.4044157328120208e-06, + "loss": 0.1248, + "num_tokens": 129626076.0, + "step": 1173 + }, + { + "epoch": 8.634686346863468, + "grad_norm": 0.6120913429381284, + "learning_rate": 1.3999528526833961e-06, + "loss": 0.1252, + "num_tokens": 129744041.0, + "step": 1174 + }, + { + "epoch": 8.642066420664207, + "grad_norm": 0.692750541310882, + "learning_rate": 1.3955135887045554e-06, + "loss": 0.1085, + "num_tokens": 129825567.0, + "step": 1175 + }, + { + "epoch": 8.649446494464945, + "grad_norm": 0.6024998881516186, + "learning_rate": 1.391097966445526e-06, + "loss": 0.11, + "num_tokens": 129957480.0, + "step": 1176 + }, + { + "epoch": 8.656826568265682, + "grad_norm": 0.718906436566963, + "learning_rate": 1.3867060113401618e-06, + "loss": 0.1214, + "num_tokens": 130078579.0, + "step": 1177 + }, + { + "epoch": 8.664206642066421, + "grad_norm": 0.5810701455364705, + "learning_rate": 1.382337748685993e-06, + "loss": 0.1346, + "num_tokens": 130191379.0, + "step": 1178 + }, + { + "epoch": 8.671586715867159, + "grad_norm": 0.7278544423341184, + "learning_rate": 1.377993203644083e-06, + "loss": 0.1081, + "num_tokens": 130280843.0, + "step": 1179 + }, + { + "epoch": 8.678966789667896, + "grad_norm": 0.6592718069090931, + "learning_rate": 1.3736724012388813e-06, + "loss": 0.1055, + "num_tokens": 130378047.0, + "step": 1180 + }, + { + "epoch": 8.686346863468636, + "grad_norm": 0.7278188201434425, + "learning_rate": 1.3693753663580834e-06, + "loss": 0.1002, + "num_tokens": 130497156.0, + "step": 1181 + }, + { + "epoch": 8.693726937269373, + "grad_norm": 0.6022490730052281, + "learning_rate": 1.3651021237524808e-06, + "loss": 0.0942, + "num_tokens": 130594120.0, + "step": 1182 + }, + { + "epoch": 8.70110701107011, + "grad_norm": 0.6312445278566866, + "learning_rate": 1.3608526980358245e-06, + "loss": 0.1063, + "num_tokens": 130698352.0, + "step": 1183 + }, + { + "epoch": 8.708487084870848, + "grad_norm": 0.5726086122274623, + "learning_rate": 1.3566271136846811e-06, + "loss": 0.0885, + "num_tokens": 130799947.0, + "step": 1184 + }, + { + "epoch": 8.715867158671587, + "grad_norm": 0.6012335783882514, + "learning_rate": 1.3524253950382904e-06, + "loss": 0.1255, + "num_tokens": 130925327.0, + "step": 1185 + }, + { + "epoch": 8.723247232472325, + "grad_norm": 0.6150543976980224, + "learning_rate": 1.3482475662984273e-06, + "loss": 0.131, + "num_tokens": 131041478.0, + "step": 1186 + }, + { + "epoch": 8.730627306273062, + "grad_norm": 0.6554155180125351, + "learning_rate": 1.3440936515292608e-06, + "loss": 0.1106, + "num_tokens": 131138021.0, + "step": 1187 + }, + { + "epoch": 8.738007380073801, + "grad_norm": 0.569887429711919, + "learning_rate": 1.3399636746572167e-06, + "loss": 0.1124, + "num_tokens": 131249631.0, + "step": 1188 + }, + { + "epoch": 8.745387453874539, + "grad_norm": 0.5518376342567334, + "learning_rate": 1.335857659470839e-06, + "loss": 0.0887, + "num_tokens": 131352987.0, + "step": 1189 + }, + { + "epoch": 8.752767527675276, + "grad_norm": 0.6139149470107075, + "learning_rate": 1.331775629620653e-06, + "loss": 0.1144, + "num_tokens": 131471711.0, + "step": 1190 + }, + { + "epoch": 8.760147601476016, + "grad_norm": 0.6318161785490038, + "learning_rate": 1.3277176086190296e-06, + "loss": 0.1423, + "num_tokens": 131575327.0, + "step": 1191 + }, + { + "epoch": 8.767527675276753, + "grad_norm": 0.6197795416833018, + "learning_rate": 1.3236836198400501e-06, + "loss": 0.1422, + "num_tokens": 131697862.0, + "step": 1192 + }, + { + "epoch": 8.77490774907749, + "grad_norm": 0.59185234264299, + "learning_rate": 1.3196736865193687e-06, + "loss": 0.1351, + "num_tokens": 131814771.0, + "step": 1193 + }, + { + "epoch": 8.782287822878228, + "grad_norm": 0.6263246337780235, + "learning_rate": 1.3156878317540835e-06, + "loss": 0.1207, + "num_tokens": 131936657.0, + "step": 1194 + }, + { + "epoch": 8.789667896678967, + "grad_norm": 0.6182091169106162, + "learning_rate": 1.3117260785025987e-06, + "loss": 0.1132, + "num_tokens": 132059032.0, + "step": 1195 + }, + { + "epoch": 8.797047970479705, + "grad_norm": 0.5904336267135564, + "learning_rate": 1.3077884495844956e-06, + "loss": 0.1427, + "num_tokens": 132158292.0, + "step": 1196 + }, + { + "epoch": 8.804428044280442, + "grad_norm": 0.6131679209745687, + "learning_rate": 1.3038749676803994e-06, + "loss": 0.1015, + "num_tokens": 132264400.0, + "step": 1197 + }, + { + "epoch": 8.811808118081181, + "grad_norm": 0.5884127696383595, + "learning_rate": 1.29998565533185e-06, + "loss": 0.108, + "num_tokens": 132352118.0, + "step": 1198 + }, + { + "epoch": 8.819188191881919, + "grad_norm": 0.5703295517477384, + "learning_rate": 1.296120534941171e-06, + "loss": 0.11, + "num_tokens": 132459362.0, + "step": 1199 + }, + { + "epoch": 8.826568265682656, + "grad_norm": 0.607194949149482, + "learning_rate": 1.2922796287713413e-06, + "loss": 0.0988, + "num_tokens": 132568274.0, + "step": 1200 + }, + { + "epoch": 8.833948339483396, + "grad_norm": 0.6442517723656137, + "learning_rate": 1.2884629589458653e-06, + "loss": 0.0941, + "num_tokens": 132662599.0, + "step": 1201 + }, + { + "epoch": 8.841328413284133, + "grad_norm": 0.615550035098192, + "learning_rate": 1.284670547448649e-06, + "loss": 0.1251, + "num_tokens": 132822501.0, + "step": 1202 + }, + { + "epoch": 8.84870848708487, + "grad_norm": 0.5943999683591958, + "learning_rate": 1.2809024161238699e-06, + "loss": 0.1093, + "num_tokens": 132927007.0, + "step": 1203 + }, + { + "epoch": 8.85608856088561, + "grad_norm": 0.5551004930503878, + "learning_rate": 1.277158586675852e-06, + "loss": 0.1034, + "num_tokens": 133036775.0, + "step": 1204 + }, + { + "epoch": 8.863468634686347, + "grad_norm": 0.6596181804090301, + "learning_rate": 1.2734390806689422e-06, + "loss": 0.1145, + "num_tokens": 133142874.0, + "step": 1205 + }, + { + "epoch": 8.870848708487085, + "grad_norm": 0.6267524284467567, + "learning_rate": 1.269743919527384e-06, + "loss": 0.0971, + "num_tokens": 133265583.0, + "step": 1206 + }, + { + "epoch": 8.878228782287822, + "grad_norm": 0.583583838659581, + "learning_rate": 1.2660731245351962e-06, + "loss": 0.1193, + "num_tokens": 133387180.0, + "step": 1207 + }, + { + "epoch": 8.885608856088561, + "grad_norm": 0.6111191321370599, + "learning_rate": 1.2624267168360479e-06, + "loss": 0.0918, + "num_tokens": 133474575.0, + "step": 1208 + }, + { + "epoch": 8.892988929889299, + "grad_norm": 0.5929732389499657, + "learning_rate": 1.2588047174331417e-06, + "loss": 0.1459, + "num_tokens": 133631311.0, + "step": 1209 + }, + { + "epoch": 8.900369003690036, + "grad_norm": 0.7561200036170181, + "learning_rate": 1.2552071471890839e-06, + "loss": 0.096, + "num_tokens": 133716163.0, + "step": 1210 + }, + { + "epoch": 8.907749077490775, + "grad_norm": 0.5415207972997961, + "learning_rate": 1.2516340268257737e-06, + "loss": 0.0854, + "num_tokens": 133827950.0, + "step": 1211 + }, + { + "epoch": 8.915129151291513, + "grad_norm": 0.7459812398595722, + "learning_rate": 1.248085376924278e-06, + "loss": 0.1236, + "num_tokens": 133927878.0, + "step": 1212 + }, + { + "epoch": 8.92250922509225, + "grad_norm": 0.634841479109334, + "learning_rate": 1.2445612179247147e-06, + "loss": 0.0921, + "num_tokens": 134020539.0, + "step": 1213 + }, + { + "epoch": 8.92988929889299, + "grad_norm": 0.6858286771716869, + "learning_rate": 1.2410615701261342e-06, + "loss": 0.1105, + "num_tokens": 134122150.0, + "step": 1214 + }, + { + "epoch": 8.937269372693727, + "grad_norm": 0.6201594087632721, + "learning_rate": 1.2375864536864055e-06, + "loss": 0.116, + "num_tokens": 134251378.0, + "step": 1215 + }, + { + "epoch": 8.944649446494465, + "grad_norm": 0.6048767594107017, + "learning_rate": 1.2341358886220942e-06, + "loss": 0.1395, + "num_tokens": 134358760.0, + "step": 1216 + }, + { + "epoch": 8.952029520295202, + "grad_norm": 0.577285117710034, + "learning_rate": 1.2307098948083538e-06, + "loss": 0.1262, + "num_tokens": 134509061.0, + "step": 1217 + }, + { + "epoch": 8.959409594095941, + "grad_norm": 0.576726954166539, + "learning_rate": 1.2273084919788066e-06, + "loss": 0.0936, + "num_tokens": 134588340.0, + "step": 1218 + }, + { + "epoch": 8.966789667896679, + "grad_norm": 0.5930291051290446, + "learning_rate": 1.2239316997254328e-06, + "loss": 0.0821, + "num_tokens": 134689789.0, + "step": 1219 + }, + { + "epoch": 8.974169741697416, + "grad_norm": 0.5702699250144813, + "learning_rate": 1.220579537498454e-06, + "loss": 0.0815, + "num_tokens": 134783139.0, + "step": 1220 + }, + { + "epoch": 8.981549815498155, + "grad_norm": 0.5278088449176532, + "learning_rate": 1.2172520246062257e-06, + "loss": 0.1046, + "num_tokens": 134893686.0, + "step": 1221 + }, + { + "epoch": 8.988929889298893, + "grad_norm": 0.6068798117027167, + "learning_rate": 1.2139491802151235e-06, + "loss": 0.1081, + "num_tokens": 135012267.0, + "step": 1222 + }, + { + "epoch": 8.99630996309963, + "grad_norm": 0.6735049740696725, + "learning_rate": 1.2106710233494326e-06, + "loss": 0.1339, + "num_tokens": 135111751.0, + "step": 1223 + }, + { + "epoch": 9.0, + "grad_norm": 0.6735049740696725, + "learning_rate": 1.2074175728912397e-06, + "loss": 0.1233, + "num_tokens": 135169229.0, + "step": 1224 + }, + { + "epoch": 9.007380073800737, + "grad_norm": 1.098013608941204, + "learning_rate": 1.2041888475803217e-06, + "loss": 0.1049, + "num_tokens": 135282785.0, + "step": 1225 + }, + { + "epoch": 9.014760147601477, + "grad_norm": 0.5532199525343341, + "learning_rate": 1.200984866014041e-06, + "loss": 0.0718, + "num_tokens": 135399869.0, + "step": 1226 + }, + { + "epoch": 9.022140221402214, + "grad_norm": 0.40362892252012594, + "learning_rate": 1.1978056466472373e-06, + "loss": 0.0798, + "num_tokens": 135500867.0, + "step": 1227 + }, + { + "epoch": 9.029520295202952, + "grad_norm": 0.43778480941123277, + "learning_rate": 1.1946512077921186e-06, + "loss": 0.0692, + "num_tokens": 135656577.0, + "step": 1228 + }, + { + "epoch": 9.03690036900369, + "grad_norm": 0.43434488006320376, + "learning_rate": 1.1915215676181597e-06, + "loss": 0.0769, + "num_tokens": 135758420.0, + "step": 1229 + }, + { + "epoch": 9.044280442804428, + "grad_norm": 0.4740122911856107, + "learning_rate": 1.1884167441519944e-06, + "loss": 0.0936, + "num_tokens": 135852458.0, + "step": 1230 + }, + { + "epoch": 9.051660516605166, + "grad_norm": 0.5561195948834813, + "learning_rate": 1.1853367552773136e-06, + "loss": 0.0773, + "num_tokens": 135946524.0, + "step": 1231 + }, + { + "epoch": 9.059040590405903, + "grad_norm": 0.6173578371571918, + "learning_rate": 1.1822816187347625e-06, + "loss": 0.0913, + "num_tokens": 136060410.0, + "step": 1232 + }, + { + "epoch": 9.066420664206642, + "grad_norm": 0.5668208716319202, + "learning_rate": 1.1792513521218355e-06, + "loss": 0.093, + "num_tokens": 136161844.0, + "step": 1233 + }, + { + "epoch": 9.07380073800738, + "grad_norm": 0.4862826369270734, + "learning_rate": 1.1762459728927795e-06, + "loss": 0.0906, + "num_tokens": 136254874.0, + "step": 1234 + }, + { + "epoch": 9.081180811808117, + "grad_norm": 0.6892251066526217, + "learning_rate": 1.1732654983584896e-06, + "loss": 0.0657, + "num_tokens": 136349326.0, + "step": 1235 + }, + { + "epoch": 9.088560885608857, + "grad_norm": 0.4569367554195257, + "learning_rate": 1.1703099456864097e-06, + "loss": 0.1044, + "num_tokens": 136480736.0, + "step": 1236 + }, + { + "epoch": 9.095940959409594, + "grad_norm": 0.7440492203877666, + "learning_rate": 1.1673793319004364e-06, + "loss": 0.0722, + "num_tokens": 136584305.0, + "step": 1237 + }, + { + "epoch": 9.103321033210332, + "grad_norm": 0.5359395869253063, + "learning_rate": 1.1644736738808176e-06, + "loss": 0.0859, + "num_tokens": 136663948.0, + "step": 1238 + }, + { + "epoch": 9.11070110701107, + "grad_norm": 0.7091484523231024, + "learning_rate": 1.1615929883640569e-06, + "loss": 0.2368, + "num_tokens": 136814952.0, + "step": 1239 + }, + { + "epoch": 9.118081180811808, + "grad_norm": 0.5697829422677767, + "learning_rate": 1.1587372919428174e-06, + "loss": 0.1117, + "num_tokens": 136947100.0, + "step": 1240 + }, + { + "epoch": 9.125461254612546, + "grad_norm": 0.48117422623071765, + "learning_rate": 1.1559066010658262e-06, + "loss": 0.0748, + "num_tokens": 137063091.0, + "step": 1241 + }, + { + "epoch": 9.132841328413285, + "grad_norm": 0.5387899819580395, + "learning_rate": 1.1531009320377783e-06, + "loss": 0.092, + "num_tokens": 137183371.0, + "step": 1242 + }, + { + "epoch": 9.140221402214022, + "grad_norm": 0.5058630416412286, + "learning_rate": 1.1503203010192432e-06, + "loss": 0.0731, + "num_tokens": 137305263.0, + "step": 1243 + }, + { + "epoch": 9.14760147601476, + "grad_norm": 0.5168951180662867, + "learning_rate": 1.1475647240265746e-06, + "loss": 0.0879, + "num_tokens": 137410135.0, + "step": 1244 + }, + { + "epoch": 9.154981549815497, + "grad_norm": 0.5688618299675912, + "learning_rate": 1.144834216931813e-06, + "loss": 0.134, + "num_tokens": 137546953.0, + "step": 1245 + }, + { + "epoch": 9.162361623616237, + "grad_norm": 0.5328692971375418, + "learning_rate": 1.1421287954625988e-06, + "loss": 0.0958, + "num_tokens": 137634130.0, + "step": 1246 + }, + { + "epoch": 9.169741697416974, + "grad_norm": 0.5054910562846227, + "learning_rate": 1.1394484752020784e-06, + "loss": 0.0765, + "num_tokens": 137736988.0, + "step": 1247 + }, + { + "epoch": 9.177121771217712, + "grad_norm": 0.4933553344922545, + "learning_rate": 1.1367932715888178e-06, + "loss": 0.0692, + "num_tokens": 137828899.0, + "step": 1248 + }, + { + "epoch": 9.18450184501845, + "grad_norm": 0.7450842938914303, + "learning_rate": 1.1341631999167104e-06, + "loss": 0.0865, + "num_tokens": 137946709.0, + "step": 1249 + }, + { + "epoch": 9.191881918819188, + "grad_norm": 0.4473386593384036, + "learning_rate": 1.131558275334891e-06, + "loss": 0.0823, + "num_tokens": 138028535.0, + "step": 1250 + }, + { + "epoch": 9.199261992619926, + "grad_norm": 0.47492406258889297, + "learning_rate": 1.1289785128476476e-06, + "loss": 0.0942, + "num_tokens": 138134748.0, + "step": 1251 + }, + { + "epoch": 9.206642066420665, + "grad_norm": 0.4362471317660537, + "learning_rate": 1.1264239273143356e-06, + "loss": 0.0869, + "num_tokens": 138265869.0, + "step": 1252 + }, + { + "epoch": 9.214022140221402, + "grad_norm": 0.5772043869677019, + "learning_rate": 1.1238945334492929e-06, + "loss": 0.0778, + "num_tokens": 138352099.0, + "step": 1253 + }, + { + "epoch": 9.22140221402214, + "grad_norm": 0.4873687523841293, + "learning_rate": 1.1213903458217511e-06, + "loss": 0.0838, + "num_tokens": 138480445.0, + "step": 1254 + }, + { + "epoch": 9.228782287822877, + "grad_norm": 0.5725491626503895, + "learning_rate": 1.1189113788557584e-06, + "loss": 0.0685, + "num_tokens": 138564835.0, + "step": 1255 + }, + { + "epoch": 9.236162361623617, + "grad_norm": 0.5378442811206418, + "learning_rate": 1.1164576468300897e-06, + "loss": 0.0864, + "num_tokens": 138679521.0, + "step": 1256 + }, + { + "epoch": 9.243542435424354, + "grad_norm": 0.4831890277582963, + "learning_rate": 1.114029163878169e-06, + "loss": 0.0646, + "num_tokens": 138766817.0, + "step": 1257 + }, + { + "epoch": 9.250922509225092, + "grad_norm": 0.5119863473235919, + "learning_rate": 1.1116259439879859e-06, + "loss": 0.0905, + "num_tokens": 138869322.0, + "step": 1258 + }, + { + "epoch": 9.25830258302583, + "grad_norm": 0.6275481878257461, + "learning_rate": 1.1092480010020154e-06, + "loss": 0.07, + "num_tokens": 138941411.0, + "step": 1259 + }, + { + "epoch": 9.265682656826568, + "grad_norm": 0.621644017095464, + "learning_rate": 1.1068953486171387e-06, + "loss": 0.1209, + "num_tokens": 139073677.0, + "step": 1260 + }, + { + "epoch": 9.273062730627306, + "grad_norm": 0.6410095734349077, + "learning_rate": 1.1045680003845635e-06, + "loss": 0.5559, + "num_tokens": 139213583.0, + "step": 1261 + }, + { + "epoch": 9.280442804428045, + "grad_norm": 0.4791056304666212, + "learning_rate": 1.1022659697097466e-06, + "loss": 0.1183, + "num_tokens": 139315732.0, + "step": 1262 + }, + { + "epoch": 9.287822878228782, + "grad_norm": 0.6139676068282646, + "learning_rate": 1.099989269852317e-06, + "loss": 0.0826, + "num_tokens": 139406775.0, + "step": 1263 + }, + { + "epoch": 9.29520295202952, + "grad_norm": 0.5727542877358448, + "learning_rate": 1.0977379139259968e-06, + "loss": 0.0715, + "num_tokens": 139490445.0, + "step": 1264 + }, + { + "epoch": 9.302583025830259, + "grad_norm": 0.4255032842147736, + "learning_rate": 1.0955119148985302e-06, + "loss": 0.0752, + "num_tokens": 139618468.0, + "step": 1265 + }, + { + "epoch": 9.309963099630997, + "grad_norm": 0.5139540382949935, + "learning_rate": 1.0933112855916057e-06, + "loss": 0.0916, + "num_tokens": 139694374.0, + "step": 1266 + }, + { + "epoch": 9.317343173431734, + "grad_norm": 0.7147664067984917, + "learning_rate": 1.0911360386807814e-06, + "loss": 0.1061, + "num_tokens": 139827843.0, + "step": 1267 + }, + { + "epoch": 9.324723247232471, + "grad_norm": 0.6217915110979024, + "learning_rate": 1.0889861866954165e-06, + "loss": 0.0628, + "num_tokens": 139932395.0, + "step": 1268 + }, + { + "epoch": 9.33210332103321, + "grad_norm": 0.47302488760885186, + "learning_rate": 1.0868617420185935e-06, + "loss": 0.0694, + "num_tokens": 140044988.0, + "step": 1269 + }, + { + "epoch": 9.339483394833948, + "grad_norm": 0.4898432241818497, + "learning_rate": 1.084762716887051e-06, + "loss": 0.0637, + "num_tokens": 140143951.0, + "step": 1270 + }, + { + "epoch": 9.346863468634686, + "grad_norm": 0.45565681545242087, + "learning_rate": 1.0826891233911122e-06, + "loss": 0.1, + "num_tokens": 140289486.0, + "step": 1271 + }, + { + "epoch": 9.354243542435425, + "grad_norm": 0.49835460213670446, + "learning_rate": 1.0806409734746128e-06, + "loss": 0.1157, + "num_tokens": 140452034.0, + "step": 1272 + }, + { + "epoch": 9.361623616236162, + "grad_norm": 0.5586254064308035, + "learning_rate": 1.0786182789348357e-06, + "loss": 0.0772, + "num_tokens": 140559290.0, + "step": 1273 + }, + { + "epoch": 9.3690036900369, + "grad_norm": 0.36499301388851546, + "learning_rate": 1.076621051422442e-06, + "loss": 0.0707, + "num_tokens": 140648021.0, + "step": 1274 + }, + { + "epoch": 9.376383763837639, + "grad_norm": 0.6315911456891137, + "learning_rate": 1.0746493024414028e-06, + "loss": 0.0701, + "num_tokens": 140760770.0, + "step": 1275 + }, + { + "epoch": 9.383763837638377, + "grad_norm": 0.4439501077654831, + "learning_rate": 1.0727030433489331e-06, + "loss": 0.0839, + "num_tokens": 140837990.0, + "step": 1276 + }, + { + "epoch": 9.391143911439114, + "grad_norm": 0.6294163873714306, + "learning_rate": 1.0707822853554275e-06, + "loss": 0.067, + "num_tokens": 140916144.0, + "step": 1277 + }, + { + "epoch": 9.398523985239853, + "grad_norm": 0.47592531161352836, + "learning_rate": 1.068887039524395e-06, + "loss": 0.0898, + "num_tokens": 141012735.0, + "step": 1278 + }, + { + "epoch": 9.40590405904059, + "grad_norm": 0.5731828438615216, + "learning_rate": 1.067017316772396e-06, + "loss": 0.0842, + "num_tokens": 141145409.0, + "step": 1279 + }, + { + "epoch": 9.413284132841328, + "grad_norm": 0.5369622337944866, + "learning_rate": 1.0651731278689773e-06, + "loss": 0.0999, + "num_tokens": 141293141.0, + "step": 1280 + }, + { + "epoch": 9.420664206642066, + "grad_norm": 0.5210810547850394, + "learning_rate": 1.0633544834366125e-06, + "loss": 0.0905, + "num_tokens": 141424378.0, + "step": 1281 + }, + { + "epoch": 9.428044280442805, + "grad_norm": 0.5441370628754586, + "learning_rate": 1.0615613939506392e-06, + "loss": 0.0573, + "num_tokens": 141534164.0, + "step": 1282 + }, + { + "epoch": 9.435424354243542, + "grad_norm": 0.4882480772188857, + "learning_rate": 1.0597938697392002e-06, + "loss": 0.0863, + "num_tokens": 141658150.0, + "step": 1283 + }, + { + "epoch": 9.44280442804428, + "grad_norm": 0.5744920846110795, + "learning_rate": 1.0580519209831818e-06, + "loss": 0.067, + "num_tokens": 141766293.0, + "step": 1284 + }, + { + "epoch": 9.450184501845019, + "grad_norm": 0.4094733363503645, + "learning_rate": 1.0563355577161578e-06, + "loss": 0.0754, + "num_tokens": 141860600.0, + "step": 1285 + }, + { + "epoch": 9.457564575645756, + "grad_norm": 0.5493452770826364, + "learning_rate": 1.0546447898243282e-06, + "loss": 0.0796, + "num_tokens": 141953505.0, + "step": 1286 + }, + { + "epoch": 9.464944649446494, + "grad_norm": 0.5578214877769143, + "learning_rate": 1.0529796270464674e-06, + "loss": 0.0548, + "num_tokens": 142055360.0, + "step": 1287 + }, + { + "epoch": 9.472324723247233, + "grad_norm": 0.49089515513264637, + "learning_rate": 1.0513400789738631e-06, + "loss": 0.0899, + "num_tokens": 142146667.0, + "step": 1288 + }, + { + "epoch": 9.47970479704797, + "grad_norm": 0.6918407979557708, + "learning_rate": 1.0497261550502631e-06, + "loss": 0.0858, + "num_tokens": 142230862.0, + "step": 1289 + }, + { + "epoch": 9.487084870848708, + "grad_norm": 0.49566833014774225, + "learning_rate": 1.0481378645718215e-06, + "loss": 0.0925, + "num_tokens": 142362429.0, + "step": 1290 + }, + { + "epoch": 9.494464944649447, + "grad_norm": 0.485006343857991, + "learning_rate": 1.0465752166870445e-06, + "loss": 0.073, + "num_tokens": 142454056.0, + "step": 1291 + }, + { + "epoch": 9.501845018450185, + "grad_norm": 0.5801596165920954, + "learning_rate": 1.0450382203967372e-06, + "loss": 0.0774, + "num_tokens": 142565299.0, + "step": 1292 + }, + { + "epoch": 9.509225092250922, + "grad_norm": 0.529155775721648, + "learning_rate": 1.043526884553953e-06, + "loss": 0.0638, + "num_tokens": 142659189.0, + "step": 1293 + }, + { + "epoch": 9.51660516605166, + "grad_norm": 0.4651717505933552, + "learning_rate": 1.0420412178639408e-06, + "loss": 0.0918, + "num_tokens": 142785942.0, + "step": 1294 + }, + { + "epoch": 9.523985239852399, + "grad_norm": 0.5415557009247054, + "learning_rate": 1.0405812288840967e-06, + "loss": 0.1002, + "num_tokens": 142905415.0, + "step": 1295 + }, + { + "epoch": 9.531365313653136, + "grad_norm": 0.5857032868286894, + "learning_rate": 1.0391469260239146e-06, + "loss": 0.1, + "num_tokens": 143046818.0, + "step": 1296 + }, + { + "epoch": 9.538745387453874, + "grad_norm": 0.5447518796281599, + "learning_rate": 1.037738317544936e-06, + "loss": 0.0913, + "num_tokens": 143154055.0, + "step": 1297 + }, + { + "epoch": 9.546125461254613, + "grad_norm": 0.5125117807248399, + "learning_rate": 1.036355411560703e-06, + "loss": 0.0651, + "num_tokens": 143263376.0, + "step": 1298 + }, + { + "epoch": 9.55350553505535, + "grad_norm": 0.48226465374189814, + "learning_rate": 1.0349982160367146e-06, + "loss": 0.0635, + "num_tokens": 143363961.0, + "step": 1299 + }, + { + "epoch": 9.560885608856088, + "grad_norm": 0.4642815323636672, + "learning_rate": 1.0336667387903755e-06, + "loss": 0.072, + "num_tokens": 143479639.0, + "step": 1300 + }, + { + "epoch": 9.568265682656827, + "grad_norm": 0.6511448364627623, + "learning_rate": 1.0323609874909552e-06, + "loss": 0.0861, + "num_tokens": 143564742.0, + "step": 1301 + }, + { + "epoch": 9.575645756457565, + "grad_norm": 0.49420521238167936, + "learning_rate": 1.0310809696595431e-06, + "loss": 0.081, + "num_tokens": 143660847.0, + "step": 1302 + }, + { + "epoch": 9.583025830258302, + "grad_norm": 0.6949716240687748, + "learning_rate": 1.029826692669003e-06, + "loss": 0.0969, + "num_tokens": 143767931.0, + "step": 1303 + }, + { + "epoch": 9.59040590405904, + "grad_norm": 0.6194488779322029, + "learning_rate": 1.028598163743934e-06, + "loss": 0.1239, + "num_tokens": 143909274.0, + "step": 1304 + }, + { + "epoch": 9.597785977859779, + "grad_norm": 0.44564646128264457, + "learning_rate": 1.0273953899606256e-06, + "loss": 0.0949, + "num_tokens": 144015506.0, + "step": 1305 + }, + { + "epoch": 9.605166051660516, + "grad_norm": 0.5191765591583786, + "learning_rate": 1.0262183782470191e-06, + "loss": 0.1157, + "num_tokens": 144142602.0, + "step": 1306 + }, + { + "epoch": 9.612546125461254, + "grad_norm": 0.6760532101276266, + "learning_rate": 1.025067135382667e-06, + "loss": 0.136, + "num_tokens": 144257871.0, + "step": 1307 + }, + { + "epoch": 9.619926199261993, + "grad_norm": 0.5689254661964385, + "learning_rate": 1.0239416679986947e-06, + "loss": 0.0731, + "num_tokens": 144385398.0, + "step": 1308 + }, + { + "epoch": 9.62730627306273, + "grad_norm": 0.525375097114315, + "learning_rate": 1.0228419825777603e-06, + "loss": 0.0893, + "num_tokens": 144473651.0, + "step": 1309 + }, + { + "epoch": 9.634686346863468, + "grad_norm": 0.546320906132776, + "learning_rate": 1.021768085454019e-06, + "loss": 0.0915, + "num_tokens": 144586199.0, + "step": 1310 + }, + { + "epoch": 9.642066420664207, + "grad_norm": 0.7454288817246733, + "learning_rate": 1.0207199828130867e-06, + "loss": 0.0808, + "num_tokens": 144672354.0, + "step": 1311 + }, + { + "epoch": 9.649446494464945, + "grad_norm": 0.5691877153607596, + "learning_rate": 1.0196976806920026e-06, + "loss": 0.1098, + "num_tokens": 144799218.0, + "step": 1312 + }, + { + "epoch": 9.656826568265682, + "grad_norm": 0.5613847781133986, + "learning_rate": 1.018701184979198e-06, + "loss": 0.1433, + "num_tokens": 144932569.0, + "step": 1313 + }, + { + "epoch": 9.664206642066421, + "grad_norm": 0.7075961001446242, + "learning_rate": 1.0177305014144579e-06, + "loss": 0.1632, + "num_tokens": 145094329.0, + "step": 1314 + }, + { + "epoch": 9.671586715867159, + "grad_norm": 0.4949014137778886, + "learning_rate": 1.0167856355888906e-06, + "loss": 0.0847, + "num_tokens": 145200137.0, + "step": 1315 + }, + { + "epoch": 9.678966789667896, + "grad_norm": 0.5777610491503402, + "learning_rate": 1.0158665929448951e-06, + "loss": 0.0783, + "num_tokens": 145299005.0, + "step": 1316 + }, + { + "epoch": 9.686346863468636, + "grad_norm": 0.5666895663706422, + "learning_rate": 1.0149733787761306e-06, + "loss": 0.1045, + "num_tokens": 145417438.0, + "step": 1317 + }, + { + "epoch": 9.693726937269373, + "grad_norm": 0.4976340719962353, + "learning_rate": 1.0141059982274833e-06, + "loss": 0.0977, + "num_tokens": 145527495.0, + "step": 1318 + }, + { + "epoch": 9.70110701107011, + "grad_norm": 0.5527340720451464, + "learning_rate": 1.0132644562950395e-06, + "loss": 0.0913, + "num_tokens": 145660524.0, + "step": 1319 + }, + { + "epoch": 9.708487084870848, + "grad_norm": 0.5971966074249265, + "learning_rate": 1.0124487578260562e-06, + "loss": 0.1005, + "num_tokens": 145804173.0, + "step": 1320 + }, + { + "epoch": 9.715867158671587, + "grad_norm": 0.5695738798252161, + "learning_rate": 1.011658907518932e-06, + "loss": 0.0968, + "num_tokens": 145912414.0, + "step": 1321 + }, + { + "epoch": 9.723247232472325, + "grad_norm": 0.5672640649845705, + "learning_rate": 1.010894909923181e-06, + "loss": 0.073, + "num_tokens": 146020616.0, + "step": 1322 + }, + { + "epoch": 9.730627306273062, + "grad_norm": 0.5872464338893862, + "learning_rate": 1.0101567694394073e-06, + "loss": 0.1033, + "num_tokens": 146111381.0, + "step": 1323 + }, + { + "epoch": 9.738007380073801, + "grad_norm": 0.6153149906400658, + "learning_rate": 1.0094444903192775e-06, + "loss": 0.0896, + "num_tokens": 146231059.0, + "step": 1324 + }, + { + "epoch": 9.745387453874539, + "grad_norm": 0.511058399196522, + "learning_rate": 1.0087580766654983e-06, + "loss": 0.0763, + "num_tokens": 146328963.0, + "step": 1325 + }, + { + "epoch": 9.752767527675276, + "grad_norm": 0.49146225377027963, + "learning_rate": 1.0080975324317925e-06, + "loss": 0.082, + "num_tokens": 146416240.0, + "step": 1326 + }, + { + "epoch": 9.760147601476016, + "grad_norm": 0.7063177174505592, + "learning_rate": 1.0074628614228752e-06, + "loss": 0.1369, + "num_tokens": 146571417.0, + "step": 1327 + }, + { + "epoch": 9.767527675276753, + "grad_norm": 0.5173878371456873, + "learning_rate": 1.0068540672944318e-06, + "loss": 0.4172, + "num_tokens": 146710002.0, + "step": 1328 + }, + { + "epoch": 9.77490774907749, + "grad_norm": 0.5063782782555744, + "learning_rate": 1.0062711535530988e-06, + "loss": 0.0717, + "num_tokens": 146805464.0, + "step": 1329 + }, + { + "epoch": 9.782287822878228, + "grad_norm": 0.5229716807346101, + "learning_rate": 1.0057141235564425e-06, + "loss": 0.0787, + "num_tokens": 146938300.0, + "step": 1330 + }, + { + "epoch": 9.789667896678967, + "grad_norm": 0.4765636960934646, + "learning_rate": 1.005182980512938e-06, + "loss": 0.0759, + "num_tokens": 147084703.0, + "step": 1331 + }, + { + "epoch": 9.797047970479705, + "grad_norm": 0.6123724015309612, + "learning_rate": 1.0046777274819546e-06, + "loss": 0.0992, + "num_tokens": 147198672.0, + "step": 1332 + }, + { + "epoch": 9.804428044280442, + "grad_norm": 0.5011853630785137, + "learning_rate": 1.0041983673737344e-06, + "loss": 0.0793, + "num_tokens": 147296656.0, + "step": 1333 + }, + { + "epoch": 9.811808118081181, + "grad_norm": 0.5179966656872552, + "learning_rate": 1.0037449029493772e-06, + "loss": 0.0712, + "num_tokens": 147385700.0, + "step": 1334 + }, + { + "epoch": 9.819188191881919, + "grad_norm": 0.5532694050161349, + "learning_rate": 1.0033173368208247e-06, + "loss": 0.0688, + "num_tokens": 147492247.0, + "step": 1335 + }, + { + "epoch": 9.826568265682656, + "grad_norm": 0.509334276535671, + "learning_rate": 1.0029156714508453e-06, + "loss": 0.0716, + "num_tokens": 147604983.0, + "step": 1336 + }, + { + "epoch": 9.833948339483396, + "grad_norm": 0.6889262547462951, + "learning_rate": 1.0025399091530194e-06, + "loss": 0.1197, + "num_tokens": 147731969.0, + "step": 1337 + }, + { + "epoch": 9.841328413284133, + "grad_norm": 0.481321432509424, + "learning_rate": 1.0021900520917265e-06, + "loss": 0.0795, + "num_tokens": 147846280.0, + "step": 1338 + }, + { + "epoch": 9.84870848708487, + "grad_norm": 0.4948805761127963, + "learning_rate": 1.001866102282133e-06, + "loss": 0.1026, + "num_tokens": 148000531.0, + "step": 1339 + }, + { + "epoch": 9.85608856088561, + "grad_norm": 0.5472075583755517, + "learning_rate": 1.0015680615901803e-06, + "loss": 0.0928, + "num_tokens": 148099330.0, + "step": 1340 + }, + { + "epoch": 9.863468634686347, + "grad_norm": 0.5123734981559245, + "learning_rate": 1.0012959317325742e-06, + "loss": 0.0863, + "num_tokens": 148216192.0, + "step": 1341 + }, + { + "epoch": 9.870848708487085, + "grad_norm": 0.41053993635286284, + "learning_rate": 1.0010497142767739e-06, + "loss": 0.095, + "num_tokens": 148329601.0, + "step": 1342 + }, + { + "epoch": 9.878228782287822, + "grad_norm": 0.5933041621898189, + "learning_rate": 1.0008294106409856e-06, + "loss": 0.628, + "num_tokens": 148460623.0, + "step": 1343 + }, + { + "epoch": 9.885608856088561, + "grad_norm": 0.5494886437683918, + "learning_rate": 1.0006350220941502e-06, + "loss": 0.0967, + "num_tokens": 148563652.0, + "step": 1344 + }, + { + "epoch": 9.892988929889299, + "grad_norm": 0.5346463232904093, + "learning_rate": 1.0004665497559418e-06, + "loss": 0.095, + "num_tokens": 148656680.0, + "step": 1345 + }, + { + "epoch": 9.900369003690036, + "grad_norm": 0.5253737724244492, + "learning_rate": 1.0003239945967546e-06, + "loss": 0.0884, + "num_tokens": 148754060.0, + "step": 1346 + }, + { + "epoch": 9.907749077490775, + "grad_norm": 0.6504020017793467, + "learning_rate": 1.0002073574377025e-06, + "loss": 0.0869, + "num_tokens": 148830389.0, + "step": 1347 + }, + { + "epoch": 9.915129151291513, + "grad_norm": 0.5903039340675148, + "learning_rate": 1.0001166389506125e-06, + "loss": 0.0853, + "num_tokens": 148922650.0, + "step": 1348 + }, + { + "epoch": 9.92250922509225, + "grad_norm": 0.625114328906451, + "learning_rate": 1.0000518396580204e-06, + "loss": 0.0815, + "num_tokens": 149031999.0, + "step": 1349 + }, + { + "epoch": 9.92988929889299, + "grad_norm": 0.5694763430053924, + "learning_rate": 1.0000129599331674e-06, + "loss": 0.0649, + "num_tokens": 149130555.0, + "step": 1350 + }, + { + "epoch": 9.92988929889299, + "step": 1350, + "total_flos": 8.903987464733983e+18, + "train_loss": 0.4533373955609622, + "train_runtime": 14911.2824, + "train_samples_per_second": 11.626, + "train_steps_per_second": 0.091 + } + ], + "logging_steps": 1, + "max_steps": 1350, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.903987464733983e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}