{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9948293691830403, "eval_steps": 500, "global_step": 1449, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002068252326783868, "grad_norm": 44.96872515263425, "learning_rate": 3.4482758620689656e-07, "loss": 11.5948, "step": 1 }, { "epoch": 0.004136504653567736, "grad_norm": 41.647082611869465, "learning_rate": 6.896551724137931e-07, "loss": 11.7341, "step": 2 }, { "epoch": 0.0062047569803516025, "grad_norm": 49.24245727508304, "learning_rate": 1.0344827586206898e-06, "loss": 11.4163, "step": 3 }, { "epoch": 0.008273009307135471, "grad_norm": 47.02045066890054, "learning_rate": 1.3793103448275862e-06, "loss": 11.5201, "step": 4 }, { "epoch": 0.010341261633919338, "grad_norm": 46.717547649387534, "learning_rate": 1.724137931034483e-06, "loss": 11.4687, "step": 5 }, { "epoch": 0.012409513960703205, "grad_norm": 48.97694838459255, "learning_rate": 2.0689655172413796e-06, "loss": 11.4487, "step": 6 }, { "epoch": 0.014477766287487074, "grad_norm": 39.981577359526604, "learning_rate": 2.413793103448276e-06, "loss": 11.6614, "step": 7 }, { "epoch": 0.016546018614270942, "grad_norm": 53.66105332664603, "learning_rate": 2.7586206896551725e-06, "loss": 11.2516, "step": 8 }, { "epoch": 0.01861427094105481, "grad_norm": 71.88351745884442, "learning_rate": 3.103448275862069e-06, "loss": 10.173, "step": 9 }, { "epoch": 0.020682523267838676, "grad_norm": 79.199847374386, "learning_rate": 3.448275862068966e-06, "loss": 9.8607, "step": 10 }, { "epoch": 0.022750775594622543, "grad_norm": 98.6775989051531, "learning_rate": 3.793103448275862e-06, "loss": 6.0905, "step": 11 }, { "epoch": 0.02481902792140641, "grad_norm": 71.15434906493034, "learning_rate": 4.137931034482759e-06, "loss": 4.6621, "step": 12 }, { "epoch": 0.02688728024819028, "grad_norm": 56.23696056603378, "learning_rate": 4.482758620689655e-06, "loss": 3.6301, "step": 13 }, { "epoch": 0.028955532574974147, "grad_norm": 53.05903658893077, "learning_rate": 4.827586206896552e-06, "loss": 3.8662, "step": 14 }, { "epoch": 0.031023784901758014, "grad_norm": 9.021390111692815, "learning_rate": 5.172413793103448e-06, "loss": 2.0007, "step": 15 }, { "epoch": 0.033092037228541885, "grad_norm": 6.1625193894892005, "learning_rate": 5.517241379310345e-06, "loss": 1.7577, "step": 16 }, { "epoch": 0.03516028955532575, "grad_norm": 5.332880463011889, "learning_rate": 5.862068965517242e-06, "loss": 1.7362, "step": 17 }, { "epoch": 0.03722854188210962, "grad_norm": 3.9757877167151774, "learning_rate": 6.206896551724138e-06, "loss": 1.5479, "step": 18 }, { "epoch": 0.03929679420889348, "grad_norm": 3.4077784357299046, "learning_rate": 6.551724137931035e-06, "loss": 1.7053, "step": 19 }, { "epoch": 0.04136504653567735, "grad_norm": 2.4413998574627485, "learning_rate": 6.896551724137932e-06, "loss": 1.608, "step": 20 }, { "epoch": 0.04343329886246122, "grad_norm": 1.6814007894394627, "learning_rate": 7.241379310344828e-06, "loss": 1.3183, "step": 21 }, { "epoch": 0.045501551189245086, "grad_norm": 8.952799051956468, "learning_rate": 7.586206896551724e-06, "loss": 1.4829, "step": 22 }, { "epoch": 0.047569803516028956, "grad_norm": 1.7481207561725765, "learning_rate": 7.93103448275862e-06, "loss": 1.2574, "step": 23 }, { "epoch": 0.04963805584281282, "grad_norm": 1.277207224408331, "learning_rate": 8.275862068965518e-06, "loss": 1.3497, "step": 24 }, { "epoch": 0.05170630816959669, "grad_norm": 1.0836216712517106, "learning_rate": 8.620689655172414e-06, "loss": 1.4071, "step": 25 }, { "epoch": 0.05377456049638056, "grad_norm": 3.656264369222436, "learning_rate": 8.96551724137931e-06, "loss": 1.3918, "step": 26 }, { "epoch": 0.055842812823164424, "grad_norm": 0.9005853611737152, "learning_rate": 9.310344827586207e-06, "loss": 1.2574, "step": 27 }, { "epoch": 0.057911065149948295, "grad_norm": 1.0471031340398365, "learning_rate": 9.655172413793103e-06, "loss": 1.2518, "step": 28 }, { "epoch": 0.05997931747673216, "grad_norm": 0.7557442283641774, "learning_rate": 1e-05, "loss": 1.1522, "step": 29 }, { "epoch": 0.06204756980351603, "grad_norm": 4.13277104944442, "learning_rate": 1.0344827586206897e-05, "loss": 1.1503, "step": 30 }, { "epoch": 0.0641158221302999, "grad_norm": 1.117626724780643, "learning_rate": 1.0689655172413794e-05, "loss": 0.9365, "step": 31 }, { "epoch": 0.06618407445708377, "grad_norm": 0.7692301663369859, "learning_rate": 1.103448275862069e-05, "loss": 1.1621, "step": 32 }, { "epoch": 0.06825232678386763, "grad_norm": 0.7558531014079825, "learning_rate": 1.1379310344827587e-05, "loss": 1.1833, "step": 33 }, { "epoch": 0.0703205791106515, "grad_norm": 0.6900307266454165, "learning_rate": 1.1724137931034483e-05, "loss": 1.2472, "step": 34 }, { "epoch": 0.07238883143743537, "grad_norm": 0.5623789730528823, "learning_rate": 1.206896551724138e-05, "loss": 1.0175, "step": 35 }, { "epoch": 0.07445708376421924, "grad_norm": 0.6674476541554689, "learning_rate": 1.2413793103448277e-05, "loss": 1.1568, "step": 36 }, { "epoch": 0.07652533609100311, "grad_norm": 0.6017862468555446, "learning_rate": 1.2758620689655174e-05, "loss": 0.9892, "step": 37 }, { "epoch": 0.07859358841778696, "grad_norm": 0.5579439170547498, "learning_rate": 1.310344827586207e-05, "loss": 0.969, "step": 38 }, { "epoch": 0.08066184074457083, "grad_norm": 0.5415690592290014, "learning_rate": 1.3448275862068966e-05, "loss": 0.9386, "step": 39 }, { "epoch": 0.0827300930713547, "grad_norm": 0.5365483631108515, "learning_rate": 1.3793103448275863e-05, "loss": 0.995, "step": 40 }, { "epoch": 0.08479834539813857, "grad_norm": 0.5485590314891456, "learning_rate": 1.4137931034482759e-05, "loss": 1.0493, "step": 41 }, { "epoch": 0.08686659772492245, "grad_norm": 0.4983900136151088, "learning_rate": 1.4482758620689657e-05, "loss": 1.0601, "step": 42 }, { "epoch": 0.0889348500517063, "grad_norm": 0.45371322087277866, "learning_rate": 1.482758620689655e-05, "loss": 1.0685, "step": 43 }, { "epoch": 0.09100310237849017, "grad_norm": 0.5744085516121196, "learning_rate": 1.5172413793103448e-05, "loss": 1.0808, "step": 44 }, { "epoch": 0.09307135470527404, "grad_norm": 0.4058635243746921, "learning_rate": 1.5517241379310346e-05, "loss": 0.9018, "step": 45 }, { "epoch": 0.09513960703205791, "grad_norm": 0.42062526755973595, "learning_rate": 1.586206896551724e-05, "loss": 1.013, "step": 46 }, { "epoch": 0.09720785935884178, "grad_norm": 0.43412059572400985, "learning_rate": 1.6206896551724137e-05, "loss": 0.9948, "step": 47 }, { "epoch": 0.09927611168562564, "grad_norm": 0.45673988106037566, "learning_rate": 1.6551724137931037e-05, "loss": 1.0284, "step": 48 }, { "epoch": 0.10134436401240951, "grad_norm": 0.4101217040316399, "learning_rate": 1.6896551724137932e-05, "loss": 0.9336, "step": 49 }, { "epoch": 0.10341261633919338, "grad_norm": 0.37970006354314173, "learning_rate": 1.7241379310344828e-05, "loss": 0.9868, "step": 50 }, { "epoch": 0.10548086866597725, "grad_norm": 0.4198544876116624, "learning_rate": 1.7586206896551724e-05, "loss": 0.9286, "step": 51 }, { "epoch": 0.10754912099276112, "grad_norm": 0.3642968589309068, "learning_rate": 1.793103448275862e-05, "loss": 0.8992, "step": 52 }, { "epoch": 0.10961737331954498, "grad_norm": 0.3596986837808807, "learning_rate": 1.827586206896552e-05, "loss": 1.008, "step": 53 }, { "epoch": 0.11168562564632885, "grad_norm": 0.35571665201756286, "learning_rate": 1.8620689655172415e-05, "loss": 0.8684, "step": 54 }, { "epoch": 0.11375387797311272, "grad_norm": 0.45465939102190406, "learning_rate": 1.896551724137931e-05, "loss": 1.1253, "step": 55 }, { "epoch": 0.11582213029989659, "grad_norm": 0.3597008937220084, "learning_rate": 1.9310344827586207e-05, "loss": 1.0695, "step": 56 }, { "epoch": 0.11789038262668046, "grad_norm": 0.38098657832433397, "learning_rate": 1.9655172413793106e-05, "loss": 0.8866, "step": 57 }, { "epoch": 0.11995863495346432, "grad_norm": 0.4206253764793995, "learning_rate": 2e-05, "loss": 1.023, "step": 58 }, { "epoch": 0.12202688728024819, "grad_norm": 0.3670120689769908, "learning_rate": 2.0344827586206897e-05, "loss": 0.91, "step": 59 }, { "epoch": 0.12409513960703206, "grad_norm": 0.3875225706167667, "learning_rate": 2.0689655172413793e-05, "loss": 0.9405, "step": 60 }, { "epoch": 0.12616339193381593, "grad_norm": 0.3532877500804614, "learning_rate": 2.1034482758620692e-05, "loss": 0.8618, "step": 61 }, { "epoch": 0.1282316442605998, "grad_norm": 0.32232319523819586, "learning_rate": 2.137931034482759e-05, "loss": 0.8809, "step": 62 }, { "epoch": 0.13029989658738367, "grad_norm": 0.32748372318247615, "learning_rate": 2.1724137931034484e-05, "loss": 0.7538, "step": 63 }, { "epoch": 0.13236814891416754, "grad_norm": 0.3205163419035703, "learning_rate": 2.206896551724138e-05, "loss": 0.791, "step": 64 }, { "epoch": 0.1344364012409514, "grad_norm": 0.35873109237169354, "learning_rate": 2.2413793103448276e-05, "loss": 1.0118, "step": 65 }, { "epoch": 0.13650465356773525, "grad_norm": 0.36769480187508585, "learning_rate": 2.2758620689655175e-05, "loss": 0.8513, "step": 66 }, { "epoch": 0.13857290589451912, "grad_norm": 0.32853865052551146, "learning_rate": 2.3103448275862067e-05, "loss": 0.8009, "step": 67 }, { "epoch": 0.140641158221303, "grad_norm": 0.3647937172236557, "learning_rate": 2.3448275862068967e-05, "loss": 0.8337, "step": 68 }, { "epoch": 0.14270941054808686, "grad_norm": 0.3655455384250789, "learning_rate": 2.3793103448275862e-05, "loss": 0.9302, "step": 69 }, { "epoch": 0.14477766287487073, "grad_norm": 0.3573001283672589, "learning_rate": 2.413793103448276e-05, "loss": 0.9785, "step": 70 }, { "epoch": 0.1468459152016546, "grad_norm": 0.32836466108351314, "learning_rate": 2.4482758620689654e-05, "loss": 0.8437, "step": 71 }, { "epoch": 0.14891416752843847, "grad_norm": 0.31627768486672403, "learning_rate": 2.4827586206896553e-05, "loss": 0.6716, "step": 72 }, { "epoch": 0.15098241985522234, "grad_norm": 0.3479705571925891, "learning_rate": 2.517241379310345e-05, "loss": 0.8769, "step": 73 }, { "epoch": 0.15305067218200621, "grad_norm": 0.32391967784160336, "learning_rate": 2.551724137931035e-05, "loss": 0.8662, "step": 74 }, { "epoch": 0.15511892450879008, "grad_norm": 0.43247171892159614, "learning_rate": 2.5862068965517244e-05, "loss": 0.7038, "step": 75 }, { "epoch": 0.15718717683557393, "grad_norm": 0.3563000943838674, "learning_rate": 2.620689655172414e-05, "loss": 0.9444, "step": 76 }, { "epoch": 0.1592554291623578, "grad_norm": 0.40323735992267545, "learning_rate": 2.6551724137931032e-05, "loss": 0.9809, "step": 77 }, { "epoch": 0.16132368148914167, "grad_norm": 0.32813525138837746, "learning_rate": 2.689655172413793e-05, "loss": 0.8754, "step": 78 }, { "epoch": 0.16339193381592554, "grad_norm": 0.31386709466569235, "learning_rate": 2.7241379310344827e-05, "loss": 0.8043, "step": 79 }, { "epoch": 0.1654601861427094, "grad_norm": 0.33090008693201167, "learning_rate": 2.7586206896551727e-05, "loss": 0.8851, "step": 80 }, { "epoch": 0.16752843846949328, "grad_norm": 0.30880410645832673, "learning_rate": 2.7931034482758622e-05, "loss": 0.8511, "step": 81 }, { "epoch": 0.16959669079627715, "grad_norm": 0.3966593167364094, "learning_rate": 2.8275862068965518e-05, "loss": 0.9353, "step": 82 }, { "epoch": 0.17166494312306102, "grad_norm": 0.3517025144623485, "learning_rate": 2.8620689655172417e-05, "loss": 0.7929, "step": 83 }, { "epoch": 0.1737331954498449, "grad_norm": 0.28898821922606377, "learning_rate": 2.8965517241379313e-05, "loss": 0.7838, "step": 84 }, { "epoch": 0.17580144777662876, "grad_norm": 0.44074049987939334, "learning_rate": 2.9310344827586206e-05, "loss": 0.8891, "step": 85 }, { "epoch": 0.1778697001034126, "grad_norm": 0.32107919578931204, "learning_rate": 2.96551724137931e-05, "loss": 0.8479, "step": 86 }, { "epoch": 0.17993795243019647, "grad_norm": 0.36772323722125616, "learning_rate": 3e-05, "loss": 0.8064, "step": 87 }, { "epoch": 0.18200620475698034, "grad_norm": 0.27643945336667447, "learning_rate": 3.0344827586206897e-05, "loss": 0.749, "step": 88 }, { "epoch": 0.18407445708376421, "grad_norm": 0.3366242147335627, "learning_rate": 3.0689655172413796e-05, "loss": 0.7727, "step": 89 }, { "epoch": 0.18614270941054809, "grad_norm": 0.3174419057025173, "learning_rate": 3.103448275862069e-05, "loss": 0.9092, "step": 90 }, { "epoch": 0.18821096173733196, "grad_norm": 0.34293894916994744, "learning_rate": 3.137931034482759e-05, "loss": 0.8168, "step": 91 }, { "epoch": 0.19027921406411583, "grad_norm": 0.32449771750114675, "learning_rate": 3.172413793103448e-05, "loss": 0.8577, "step": 92 }, { "epoch": 0.1923474663908997, "grad_norm": 0.6090328597245179, "learning_rate": 3.206896551724138e-05, "loss": 0.7706, "step": 93 }, { "epoch": 0.19441571871768357, "grad_norm": 0.34573587325597416, "learning_rate": 3.2413793103448275e-05, "loss": 0.8454, "step": 94 }, { "epoch": 0.19648397104446744, "grad_norm": 0.2939832188805289, "learning_rate": 3.275862068965517e-05, "loss": 0.7753, "step": 95 }, { "epoch": 0.19855222337125128, "grad_norm": 0.3245900451122045, "learning_rate": 3.310344827586207e-05, "loss": 0.7473, "step": 96 }, { "epoch": 0.20062047569803515, "grad_norm": 0.37124414797073296, "learning_rate": 3.344827586206897e-05, "loss": 0.783, "step": 97 }, { "epoch": 0.20268872802481902, "grad_norm": 0.33450211579347927, "learning_rate": 3.3793103448275865e-05, "loss": 0.8364, "step": 98 }, { "epoch": 0.2047569803516029, "grad_norm": 0.9227562028595444, "learning_rate": 3.413793103448276e-05, "loss": 0.8069, "step": 99 }, { "epoch": 0.20682523267838676, "grad_norm": 0.45275558610640254, "learning_rate": 3.4482758620689657e-05, "loss": 0.9529, "step": 100 }, { "epoch": 0.20889348500517063, "grad_norm": 0.43082254996969993, "learning_rate": 3.482758620689655e-05, "loss": 0.7501, "step": 101 }, { "epoch": 0.2109617373319545, "grad_norm": 0.4602977655016315, "learning_rate": 3.517241379310345e-05, "loss": 0.8991, "step": 102 }, { "epoch": 0.21302998965873837, "grad_norm": 0.3641785658942789, "learning_rate": 3.5517241379310344e-05, "loss": 0.7837, "step": 103 }, { "epoch": 0.21509824198552224, "grad_norm": 0.5469631355246571, "learning_rate": 3.586206896551724e-05, "loss": 0.9701, "step": 104 }, { "epoch": 0.2171664943123061, "grad_norm": 0.4383562444128941, "learning_rate": 3.620689655172414e-05, "loss": 0.7233, "step": 105 }, { "epoch": 0.21923474663908996, "grad_norm": 0.5250127428756021, "learning_rate": 3.655172413793104e-05, "loss": 0.8575, "step": 106 }, { "epoch": 0.22130299896587383, "grad_norm": 0.5759624531089934, "learning_rate": 3.6896551724137934e-05, "loss": 0.7337, "step": 107 }, { "epoch": 0.2233712512926577, "grad_norm": 0.445117989634426, "learning_rate": 3.724137931034483e-05, "loss": 0.8441, "step": 108 }, { "epoch": 0.22543950361944157, "grad_norm": 0.6384313883045873, "learning_rate": 3.7586206896551726e-05, "loss": 0.7303, "step": 109 }, { "epoch": 0.22750775594622544, "grad_norm": 0.41708229701746036, "learning_rate": 3.793103448275862e-05, "loss": 0.878, "step": 110 }, { "epoch": 0.2295760082730093, "grad_norm": 0.7264816402770112, "learning_rate": 3.827586206896552e-05, "loss": 0.8115, "step": 111 }, { "epoch": 0.23164426059979318, "grad_norm": 0.40294398363956946, "learning_rate": 3.862068965517241e-05, "loss": 0.7859, "step": 112 }, { "epoch": 0.23371251292657705, "grad_norm": 0.7730710826332048, "learning_rate": 3.896551724137931e-05, "loss": 0.8698, "step": 113 }, { "epoch": 0.23578076525336092, "grad_norm": 0.5210489124717299, "learning_rate": 3.931034482758621e-05, "loss": 0.721, "step": 114 }, { "epoch": 0.2378490175801448, "grad_norm": 0.6222824133348206, "learning_rate": 3.965517241379311e-05, "loss": 0.7424, "step": 115 }, { "epoch": 0.23991726990692863, "grad_norm": 0.5271132956642613, "learning_rate": 4e-05, "loss": 0.8161, "step": 116 }, { "epoch": 0.2419855222337125, "grad_norm": 1.0332751501970656, "learning_rate": 4.03448275862069e-05, "loss": 0.7148, "step": 117 }, { "epoch": 0.24405377456049637, "grad_norm": 0.5873093604518775, "learning_rate": 4.0689655172413795e-05, "loss": 0.9579, "step": 118 }, { "epoch": 0.24612202688728024, "grad_norm": 0.5755604244992475, "learning_rate": 4.103448275862069e-05, "loss": 0.7769, "step": 119 }, { "epoch": 0.2481902792140641, "grad_norm": 0.40588352580252446, "learning_rate": 4.1379310344827587e-05, "loss": 0.7213, "step": 120 }, { "epoch": 0.250258531540848, "grad_norm": 0.5873575280311084, "learning_rate": 4.172413793103448e-05, "loss": 0.7751, "step": 121 }, { "epoch": 0.25232678386763185, "grad_norm": 0.4510546423140904, "learning_rate": 4.2068965517241385e-05, "loss": 0.7747, "step": 122 }, { "epoch": 0.2543950361944157, "grad_norm": 0.5963498397216683, "learning_rate": 4.241379310344828e-05, "loss": 0.797, "step": 123 }, { "epoch": 0.2564632885211996, "grad_norm": 0.7330779858409497, "learning_rate": 4.275862068965518e-05, "loss": 0.6579, "step": 124 }, { "epoch": 0.25853154084798347, "grad_norm": 0.45188768835870213, "learning_rate": 4.3103448275862066e-05, "loss": 0.8109, "step": 125 }, { "epoch": 0.26059979317476734, "grad_norm": 0.7975003867977893, "learning_rate": 4.344827586206897e-05, "loss": 0.8154, "step": 126 }, { "epoch": 0.2626680455015512, "grad_norm": 0.4817770160026935, "learning_rate": 4.3793103448275864e-05, "loss": 0.8655, "step": 127 }, { "epoch": 0.2647362978283351, "grad_norm": 0.6537022627279404, "learning_rate": 4.413793103448276e-05, "loss": 0.7041, "step": 128 }, { "epoch": 0.26680455015511895, "grad_norm": 0.5482642050380746, "learning_rate": 4.4482758620689656e-05, "loss": 0.777, "step": 129 }, { "epoch": 0.2688728024819028, "grad_norm": 0.5392566566589209, "learning_rate": 4.482758620689655e-05, "loss": 0.6765, "step": 130 }, { "epoch": 0.27094105480868663, "grad_norm": 0.5303321230860171, "learning_rate": 4.5172413793103454e-05, "loss": 0.7078, "step": 131 }, { "epoch": 0.2730093071354705, "grad_norm": 0.5252983529010756, "learning_rate": 4.551724137931035e-05, "loss": 0.916, "step": 132 }, { "epoch": 0.2750775594622544, "grad_norm": 0.7481525892421227, "learning_rate": 4.586206896551724e-05, "loss": 0.8451, "step": 133 }, { "epoch": 0.27714581178903824, "grad_norm": 0.5917652724961572, "learning_rate": 4.6206896551724135e-05, "loss": 0.7762, "step": 134 }, { "epoch": 0.2792140641158221, "grad_norm": 0.5438028629289449, "learning_rate": 4.655172413793104e-05, "loss": 0.8146, "step": 135 }, { "epoch": 0.281282316442606, "grad_norm": 4.112345840607704, "learning_rate": 4.689655172413793e-05, "loss": 0.9148, "step": 136 }, { "epoch": 0.28335056876938985, "grad_norm": 1.259146413744445, "learning_rate": 4.724137931034483e-05, "loss": 0.7543, "step": 137 }, { "epoch": 0.2854188210961737, "grad_norm": 0.506195684769057, "learning_rate": 4.7586206896551725e-05, "loss": 0.82, "step": 138 }, { "epoch": 0.2874870734229576, "grad_norm": 1.059039981115955, "learning_rate": 4.793103448275863e-05, "loss": 0.7308, "step": 139 }, { "epoch": 0.28955532574974147, "grad_norm": 0.6377758365788767, "learning_rate": 4.827586206896552e-05, "loss": 0.7458, "step": 140 }, { "epoch": 0.29162357807652534, "grad_norm": 0.7456188116492265, "learning_rate": 4.862068965517241e-05, "loss": 0.7313, "step": 141 }, { "epoch": 0.2936918304033092, "grad_norm": 0.7086427981181223, "learning_rate": 4.896551724137931e-05, "loss": 0.8129, "step": 142 }, { "epoch": 0.2957600827300931, "grad_norm": 0.6133999531367277, "learning_rate": 4.931034482758621e-05, "loss": 0.7435, "step": 143 }, { "epoch": 0.29782833505687695, "grad_norm": 0.7889066188317254, "learning_rate": 4.9655172413793107e-05, "loss": 0.8142, "step": 144 }, { "epoch": 0.2998965873836608, "grad_norm": 0.5122477211648202, "learning_rate": 5e-05, "loss": 0.6945, "step": 145 }, { "epoch": 0.3019648397104447, "grad_norm": 0.6230438685377213, "learning_rate": 4.996165644171779e-05, "loss": 0.7548, "step": 146 }, { "epoch": 0.30403309203722856, "grad_norm": 0.6082578314299645, "learning_rate": 4.992331288343559e-05, "loss": 0.7297, "step": 147 }, { "epoch": 0.30610134436401243, "grad_norm": 0.5341727488516114, "learning_rate": 4.988496932515337e-05, "loss": 0.7727, "step": 148 }, { "epoch": 0.3081695966907963, "grad_norm": 0.5175317681624382, "learning_rate": 4.984662576687117e-05, "loss": 0.6388, "step": 149 }, { "epoch": 0.31023784901758017, "grad_norm": 0.451323871631322, "learning_rate": 4.980828220858896e-05, "loss": 0.6615, "step": 150 }, { "epoch": 0.31230610134436404, "grad_norm": 0.6724906857420004, "learning_rate": 4.976993865030675e-05, "loss": 0.7838, "step": 151 }, { "epoch": 0.31437435367114785, "grad_norm": 0.4664280561214099, "learning_rate": 4.973159509202454e-05, "loss": 0.8555, "step": 152 }, { "epoch": 0.3164426059979317, "grad_norm": 0.6268705414327511, "learning_rate": 4.9693251533742335e-05, "loss": 0.8137, "step": 153 }, { "epoch": 0.3185108583247156, "grad_norm": 0.41287071362780153, "learning_rate": 4.965490797546013e-05, "loss": 0.7615, "step": 154 }, { "epoch": 0.32057911065149947, "grad_norm": 0.5763963307191156, "learning_rate": 4.961656441717792e-05, "loss": 0.674, "step": 155 }, { "epoch": 0.32264736297828334, "grad_norm": 0.4187367934382774, "learning_rate": 4.9578220858895705e-05, "loss": 0.6485, "step": 156 }, { "epoch": 0.3247156153050672, "grad_norm": 2.670736456207289, "learning_rate": 4.9539877300613504e-05, "loss": 0.9569, "step": 157 }, { "epoch": 0.3267838676318511, "grad_norm": 0.6935718311486438, "learning_rate": 4.950153374233129e-05, "loss": 0.8001, "step": 158 }, { "epoch": 0.32885211995863495, "grad_norm": 0.617960610915813, "learning_rate": 4.946319018404908e-05, "loss": 0.6782, "step": 159 }, { "epoch": 0.3309203722854188, "grad_norm": 0.6865187300605095, "learning_rate": 4.9424846625766875e-05, "loss": 0.7106, "step": 160 }, { "epoch": 0.3329886246122027, "grad_norm": 0.4754490481671121, "learning_rate": 4.938650306748466e-05, "loss": 0.7631, "step": 161 }, { "epoch": 0.33505687693898656, "grad_norm": 0.6280987969029226, "learning_rate": 4.934815950920246e-05, "loss": 0.8351, "step": 162 }, { "epoch": 0.33712512926577043, "grad_norm": 0.5245762959140692, "learning_rate": 4.9309815950920245e-05, "loss": 0.7443, "step": 163 }, { "epoch": 0.3391933815925543, "grad_norm": 1.1736601256101928, "learning_rate": 4.927147239263804e-05, "loss": 0.8406, "step": 164 }, { "epoch": 0.34126163391933817, "grad_norm": 0.5900443083999614, "learning_rate": 4.923312883435583e-05, "loss": 0.7347, "step": 165 }, { "epoch": 0.34332988624612204, "grad_norm": 0.6438078323218407, "learning_rate": 4.919478527607362e-05, "loss": 0.75, "step": 166 }, { "epoch": 0.3453981385729059, "grad_norm": 0.6648231289404565, "learning_rate": 4.9156441717791415e-05, "loss": 0.8414, "step": 167 }, { "epoch": 0.3474663908996898, "grad_norm": 0.46590800016858336, "learning_rate": 4.911809815950921e-05, "loss": 0.7943, "step": 168 }, { "epoch": 0.34953464322647365, "grad_norm": 0.6630314136704358, "learning_rate": 4.907975460122699e-05, "loss": 0.6556, "step": 169 }, { "epoch": 0.3516028955532575, "grad_norm": 0.4176618450135697, "learning_rate": 4.904141104294479e-05, "loss": 0.7437, "step": 170 }, { "epoch": 0.3536711478800414, "grad_norm": 0.7146307315484626, "learning_rate": 4.900306748466258e-05, "loss": 0.7829, "step": 171 }, { "epoch": 0.3557394002068252, "grad_norm": 0.5186169099094696, "learning_rate": 4.896472392638037e-05, "loss": 0.7277, "step": 172 }, { "epoch": 0.3578076525336091, "grad_norm": 0.593308461665732, "learning_rate": 4.892638036809816e-05, "loss": 0.6777, "step": 173 }, { "epoch": 0.35987590486039295, "grad_norm": 0.5730122128492083, "learning_rate": 4.8888036809815955e-05, "loss": 0.859, "step": 174 }, { "epoch": 0.3619441571871768, "grad_norm": 0.9222271808541631, "learning_rate": 4.884969325153375e-05, "loss": 0.7608, "step": 175 }, { "epoch": 0.3640124095139607, "grad_norm": 0.5863504348483816, "learning_rate": 4.881134969325153e-05, "loss": 0.7135, "step": 176 }, { "epoch": 0.36608066184074456, "grad_norm": 0.4588493609600725, "learning_rate": 4.877300613496933e-05, "loss": 0.7737, "step": 177 }, { "epoch": 0.36814891416752843, "grad_norm": 0.6780039071940922, "learning_rate": 4.873466257668712e-05, "loss": 0.7524, "step": 178 }, { "epoch": 0.3702171664943123, "grad_norm": 0.37281788282421974, "learning_rate": 4.869631901840491e-05, "loss": 0.5925, "step": 179 }, { "epoch": 0.37228541882109617, "grad_norm": 0.7890020613734764, "learning_rate": 4.86579754601227e-05, "loss": 0.6941, "step": 180 }, { "epoch": 0.37435367114788004, "grad_norm": 0.5910579961950915, "learning_rate": 4.8619631901840495e-05, "loss": 0.7596, "step": 181 }, { "epoch": 0.3764219234746639, "grad_norm": 0.5074699770077028, "learning_rate": 4.858128834355829e-05, "loss": 0.8553, "step": 182 }, { "epoch": 0.3784901758014478, "grad_norm": 0.5985200516393926, "learning_rate": 4.854294478527607e-05, "loss": 0.6354, "step": 183 }, { "epoch": 0.38055842812823165, "grad_norm": 0.4995672957490536, "learning_rate": 4.8504601226993865e-05, "loss": 0.7309, "step": 184 }, { "epoch": 0.3826266804550155, "grad_norm": 0.5359345819715812, "learning_rate": 4.846625766871166e-05, "loss": 0.6091, "step": 185 }, { "epoch": 0.3846949327817994, "grad_norm": 0.5008868562485119, "learning_rate": 4.842791411042945e-05, "loss": 0.6372, "step": 186 }, { "epoch": 0.38676318510858326, "grad_norm": 1.2587451790801742, "learning_rate": 4.838957055214724e-05, "loss": 0.6924, "step": 187 }, { "epoch": 0.38883143743536713, "grad_norm": 0.5425753902333572, "learning_rate": 4.8351226993865035e-05, "loss": 0.8073, "step": 188 }, { "epoch": 0.390899689762151, "grad_norm": 0.54199059158459, "learning_rate": 4.831288343558282e-05, "loss": 0.8001, "step": 189 }, { "epoch": 0.3929679420889349, "grad_norm": 0.41140267245221085, "learning_rate": 4.827453987730062e-05, "loss": 0.7092, "step": 190 }, { "epoch": 0.39503619441571874, "grad_norm": 0.5271352938785425, "learning_rate": 4.8236196319018405e-05, "loss": 0.8288, "step": 191 }, { "epoch": 0.39710444674250256, "grad_norm": 0.7798082460891234, "learning_rate": 4.81978527607362e-05, "loss": 0.7135, "step": 192 }, { "epoch": 0.39917269906928643, "grad_norm": 0.48212956739732693, "learning_rate": 4.815950920245399e-05, "loss": 0.7402, "step": 193 }, { "epoch": 0.4012409513960703, "grad_norm": 0.5022078025881384, "learning_rate": 4.812116564417178e-05, "loss": 0.6594, "step": 194 }, { "epoch": 0.40330920372285417, "grad_norm": 0.4527358957876632, "learning_rate": 4.8082822085889575e-05, "loss": 0.698, "step": 195 }, { "epoch": 0.40537745604963804, "grad_norm": 0.6263639572647242, "learning_rate": 4.804447852760736e-05, "loss": 0.7658, "step": 196 }, { "epoch": 0.4074457083764219, "grad_norm": 1.3805266948840713, "learning_rate": 4.800613496932516e-05, "loss": 0.7531, "step": 197 }, { "epoch": 0.4095139607032058, "grad_norm": 0.4476854242713903, "learning_rate": 4.7967791411042945e-05, "loss": 0.782, "step": 198 }, { "epoch": 0.41158221302998965, "grad_norm": 0.7567601288850552, "learning_rate": 4.792944785276074e-05, "loss": 0.7418, "step": 199 }, { "epoch": 0.4136504653567735, "grad_norm": 0.7905614781320054, "learning_rate": 4.789110429447853e-05, "loss": 0.703, "step": 200 }, { "epoch": 0.4157187176835574, "grad_norm": 0.504353615699897, "learning_rate": 4.785276073619632e-05, "loss": 0.7323, "step": 201 }, { "epoch": 0.41778697001034126, "grad_norm": 0.582734598797599, "learning_rate": 4.7814417177914114e-05, "loss": 0.758, "step": 202 }, { "epoch": 0.41985522233712513, "grad_norm": 0.7700433336759996, "learning_rate": 4.777607361963191e-05, "loss": 0.7072, "step": 203 }, { "epoch": 0.421923474663909, "grad_norm": 0.8455917741382415, "learning_rate": 4.773773006134969e-05, "loss": 0.769, "step": 204 }, { "epoch": 0.4239917269906929, "grad_norm": 0.5053968492474978, "learning_rate": 4.769938650306749e-05, "loss": 0.697, "step": 205 }, { "epoch": 0.42605997931747674, "grad_norm": 0.6836736842696958, "learning_rate": 4.766104294478528e-05, "loss": 0.7752, "step": 206 }, { "epoch": 0.4281282316442606, "grad_norm": 0.5778816154336093, "learning_rate": 4.762269938650307e-05, "loss": 0.6663, "step": 207 }, { "epoch": 0.4301964839710445, "grad_norm": 1.0688523887820445, "learning_rate": 4.758435582822086e-05, "loss": 0.6876, "step": 208 }, { "epoch": 0.43226473629782836, "grad_norm": 0.9108235495992374, "learning_rate": 4.754601226993865e-05, "loss": 0.8986, "step": 209 }, { "epoch": 0.4343329886246122, "grad_norm": 0.664498063900217, "learning_rate": 4.750766871165645e-05, "loss": 0.6144, "step": 210 }, { "epoch": 0.4364012409513961, "grad_norm": 0.7129764454396, "learning_rate": 4.746932515337423e-05, "loss": 0.7029, "step": 211 }, { "epoch": 0.4384694932781799, "grad_norm": 0.5230879417849138, "learning_rate": 4.7430981595092025e-05, "loss": 0.7287, "step": 212 }, { "epoch": 0.4405377456049638, "grad_norm": 0.5803256828446165, "learning_rate": 4.739263803680982e-05, "loss": 0.6718, "step": 213 }, { "epoch": 0.44260599793174765, "grad_norm": 0.4621872100510366, "learning_rate": 4.735429447852761e-05, "loss": 0.6125, "step": 214 }, { "epoch": 0.4446742502585315, "grad_norm": 0.536713491212011, "learning_rate": 4.73159509202454e-05, "loss": 0.7379, "step": 215 }, { "epoch": 0.4467425025853154, "grad_norm": 0.5341787532833311, "learning_rate": 4.7277607361963194e-05, "loss": 0.819, "step": 216 }, { "epoch": 0.44881075491209926, "grad_norm": 0.5100067379690079, "learning_rate": 4.723926380368098e-05, "loss": 0.7573, "step": 217 }, { "epoch": 0.45087900723888313, "grad_norm": 0.43494883601639, "learning_rate": 4.720092024539878e-05, "loss": 0.8001, "step": 218 }, { "epoch": 0.452947259565667, "grad_norm": 0.4663560719691748, "learning_rate": 4.7162576687116565e-05, "loss": 0.8028, "step": 219 }, { "epoch": 0.4550155118924509, "grad_norm": 0.3727192555606306, "learning_rate": 4.7124233128834364e-05, "loss": 0.6883, "step": 220 }, { "epoch": 0.45708376421923474, "grad_norm": 0.47316790112623164, "learning_rate": 4.708588957055215e-05, "loss": 0.7411, "step": 221 }, { "epoch": 0.4591520165460186, "grad_norm": 0.31910453451052295, "learning_rate": 4.7047546012269935e-05, "loss": 0.7228, "step": 222 }, { "epoch": 0.4612202688728025, "grad_norm": 0.5495866922698741, "learning_rate": 4.7009202453987734e-05, "loss": 0.7403, "step": 223 }, { "epoch": 0.46328852119958636, "grad_norm": 0.3903265320417465, "learning_rate": 4.697085889570552e-05, "loss": 0.6694, "step": 224 }, { "epoch": 0.4653567735263702, "grad_norm": 0.4360910963284313, "learning_rate": 4.693251533742332e-05, "loss": 0.65, "step": 225 }, { "epoch": 0.4674250258531541, "grad_norm": 0.7524648583164402, "learning_rate": 4.6894171779141105e-05, "loss": 0.7772, "step": 226 }, { "epoch": 0.46949327817993797, "grad_norm": 0.43292144544007294, "learning_rate": 4.68558282208589e-05, "loss": 0.6935, "step": 227 }, { "epoch": 0.47156153050672184, "grad_norm": 0.5582719387528962, "learning_rate": 4.681748466257669e-05, "loss": 0.7543, "step": 228 }, { "epoch": 0.4736297828335057, "grad_norm": 2.0692272926110418, "learning_rate": 4.677914110429448e-05, "loss": 0.8669, "step": 229 }, { "epoch": 0.4756980351602896, "grad_norm": 0.9114187050910966, "learning_rate": 4.6740797546012274e-05, "loss": 0.5469, "step": 230 }, { "epoch": 0.47776628748707345, "grad_norm": 0.46380271311760257, "learning_rate": 4.670245398773007e-05, "loss": 0.6045, "step": 231 }, { "epoch": 0.47983453981385726, "grad_norm": 0.43816681277871933, "learning_rate": 4.666411042944785e-05, "loss": 0.7006, "step": 232 }, { "epoch": 0.48190279214064113, "grad_norm": 0.45852848576898625, "learning_rate": 4.6625766871165645e-05, "loss": 0.7475, "step": 233 }, { "epoch": 0.483971044467425, "grad_norm": 0.5175650905939856, "learning_rate": 4.658742331288344e-05, "loss": 0.697, "step": 234 }, { "epoch": 0.4860392967942089, "grad_norm": 0.42340967739418944, "learning_rate": 4.654907975460123e-05, "loss": 0.678, "step": 235 }, { "epoch": 0.48810754912099275, "grad_norm": 0.4622668403794398, "learning_rate": 4.651073619631902e-05, "loss": 0.6179, "step": 236 }, { "epoch": 0.4901758014477766, "grad_norm": 0.444243681993541, "learning_rate": 4.647239263803681e-05, "loss": 0.6376, "step": 237 }, { "epoch": 0.4922440537745605, "grad_norm": 0.5373395811127366, "learning_rate": 4.643404907975461e-05, "loss": 0.8104, "step": 238 }, { "epoch": 0.49431230610134436, "grad_norm": 0.43563853393915747, "learning_rate": 4.639570552147239e-05, "loss": 0.7064, "step": 239 }, { "epoch": 0.4963805584281282, "grad_norm": 0.5026858328027761, "learning_rate": 4.6357361963190185e-05, "loss": 0.7152, "step": 240 }, { "epoch": 0.4984488107549121, "grad_norm": 0.7551462923479503, "learning_rate": 4.631901840490798e-05, "loss": 0.7128, "step": 241 }, { "epoch": 0.500517063081696, "grad_norm": 0.5716696189724575, "learning_rate": 4.628067484662577e-05, "loss": 0.7358, "step": 242 }, { "epoch": 0.5025853154084798, "grad_norm": 0.44354219059461464, "learning_rate": 4.624233128834356e-05, "loss": 0.7642, "step": 243 }, { "epoch": 0.5046535677352637, "grad_norm": 0.5517699485676673, "learning_rate": 4.6203987730061354e-05, "loss": 0.7202, "step": 244 }, { "epoch": 0.5067218200620476, "grad_norm": 0.39208990344280686, "learning_rate": 4.616564417177914e-05, "loss": 0.6876, "step": 245 }, { "epoch": 0.5087900723888314, "grad_norm": 0.46109383077249627, "learning_rate": 4.612730061349693e-05, "loss": 0.6113, "step": 246 }, { "epoch": 0.5108583247156153, "grad_norm": 0.3914021679873914, "learning_rate": 4.6088957055214725e-05, "loss": 0.7069, "step": 247 }, { "epoch": 0.5129265770423992, "grad_norm": 0.44058463287853394, "learning_rate": 4.605061349693252e-05, "loss": 0.6954, "step": 248 }, { "epoch": 0.5149948293691831, "grad_norm": 0.44102679660405025, "learning_rate": 4.601226993865031e-05, "loss": 0.6605, "step": 249 }, { "epoch": 0.5170630816959669, "grad_norm": 0.37545440553434223, "learning_rate": 4.59739263803681e-05, "loss": 0.6926, "step": 250 }, { "epoch": 0.5191313340227508, "grad_norm": 0.5374758614268441, "learning_rate": 4.5935582822085894e-05, "loss": 0.8343, "step": 251 }, { "epoch": 0.5211995863495347, "grad_norm": 0.38171688553120464, "learning_rate": 4.589723926380368e-05, "loss": 0.7152, "step": 252 }, { "epoch": 0.5232678386763185, "grad_norm": 0.4570757364693712, "learning_rate": 4.585889570552148e-05, "loss": 0.6548, "step": 253 }, { "epoch": 0.5253360910031024, "grad_norm": 0.4654305372870469, "learning_rate": 4.5820552147239265e-05, "loss": 0.6861, "step": 254 }, { "epoch": 0.5274043433298863, "grad_norm": 0.3858774375652063, "learning_rate": 4.578220858895706e-05, "loss": 0.6865, "step": 255 }, { "epoch": 0.5294725956566702, "grad_norm": 0.5550862926568952, "learning_rate": 4.574386503067485e-05, "loss": 0.7392, "step": 256 }, { "epoch": 0.531540847983454, "grad_norm": 0.40770971291129593, "learning_rate": 4.570552147239264e-05, "loss": 0.6866, "step": 257 }, { "epoch": 0.5336091003102379, "grad_norm": 0.439560660722042, "learning_rate": 4.5667177914110434e-05, "loss": 0.7956, "step": 258 }, { "epoch": 0.5356773526370218, "grad_norm": 0.4320621188178195, "learning_rate": 4.562883435582822e-05, "loss": 0.7029, "step": 259 }, { "epoch": 0.5377456049638056, "grad_norm": 0.3089959436850672, "learning_rate": 4.559049079754601e-05, "loss": 0.5919, "step": 260 }, { "epoch": 0.5398138572905895, "grad_norm": 2.7056160580439785, "learning_rate": 4.5552147239263805e-05, "loss": 0.7129, "step": 261 }, { "epoch": 0.5418821096173733, "grad_norm": 0.5456111335517104, "learning_rate": 4.55138036809816e-05, "loss": 0.6728, "step": 262 }, { "epoch": 0.5439503619441571, "grad_norm": 0.2920072427921721, "learning_rate": 4.547546012269939e-05, "loss": 0.5772, "step": 263 }, { "epoch": 0.546018614270941, "grad_norm": 0.5205673546761084, "learning_rate": 4.543711656441718e-05, "loss": 0.7565, "step": 264 }, { "epoch": 0.5480868665977249, "grad_norm": 0.399997839069775, "learning_rate": 4.539877300613497e-05, "loss": 0.6698, "step": 265 }, { "epoch": 0.5501551189245087, "grad_norm": 0.39082345446152333, "learning_rate": 4.5360429447852767e-05, "loss": 0.805, "step": 266 }, { "epoch": 0.5522233712512926, "grad_norm": 0.47449324026780326, "learning_rate": 4.532208588957055e-05, "loss": 0.7074, "step": 267 }, { "epoch": 0.5542916235780765, "grad_norm": 0.5580104344018255, "learning_rate": 4.528374233128835e-05, "loss": 0.825, "step": 268 }, { "epoch": 0.5563598759048604, "grad_norm": 0.47258425561040823, "learning_rate": 4.524539877300614e-05, "loss": 0.6616, "step": 269 }, { "epoch": 0.5584281282316442, "grad_norm": 0.47226138278240776, "learning_rate": 4.520705521472393e-05, "loss": 0.6288, "step": 270 }, { "epoch": 0.5604963805584281, "grad_norm": 0.4397573301476966, "learning_rate": 4.516871165644172e-05, "loss": 0.6556, "step": 271 }, { "epoch": 0.562564632885212, "grad_norm": 0.5242890218430927, "learning_rate": 4.513036809815951e-05, "loss": 0.6605, "step": 272 }, { "epoch": 0.5646328852119958, "grad_norm": 0.6581319648656121, "learning_rate": 4.5092024539877307e-05, "loss": 0.7815, "step": 273 }, { "epoch": 0.5667011375387797, "grad_norm": 0.3959968683917134, "learning_rate": 4.505368098159509e-05, "loss": 0.6457, "step": 274 }, { "epoch": 0.5687693898655636, "grad_norm": 0.6273410465952522, "learning_rate": 4.5015337423312885e-05, "loss": 0.6433, "step": 275 }, { "epoch": 0.5708376421923474, "grad_norm": 0.4253556860453283, "learning_rate": 4.497699386503068e-05, "loss": 0.6674, "step": 276 }, { "epoch": 0.5729058945191313, "grad_norm": 0.5398077952066184, "learning_rate": 4.493865030674847e-05, "loss": 0.6441, "step": 277 }, { "epoch": 0.5749741468459152, "grad_norm": 0.40290404966563503, "learning_rate": 4.490030674846626e-05, "loss": 0.7933, "step": 278 }, { "epoch": 0.5770423991726991, "grad_norm": 0.5069221476751035, "learning_rate": 4.4861963190184054e-05, "loss": 0.6089, "step": 279 }, { "epoch": 0.5791106514994829, "grad_norm": 0.4376610639381264, "learning_rate": 4.482361963190184e-05, "loss": 0.6929, "step": 280 }, { "epoch": 0.5811789038262668, "grad_norm": 0.4852639615780228, "learning_rate": 4.478527607361964e-05, "loss": 0.7772, "step": 281 }, { "epoch": 0.5832471561530507, "grad_norm": 0.406139807784322, "learning_rate": 4.4746932515337424e-05, "loss": 0.6464, "step": 282 }, { "epoch": 0.5853154084798345, "grad_norm": 0.4684397454157715, "learning_rate": 4.470858895705522e-05, "loss": 0.6742, "step": 283 }, { "epoch": 0.5873836608066184, "grad_norm": 0.300579543324092, "learning_rate": 4.467024539877301e-05, "loss": 0.6256, "step": 284 }, { "epoch": 0.5894519131334023, "grad_norm": 0.471434574171312, "learning_rate": 4.4631901840490795e-05, "loss": 0.617, "step": 285 }, { "epoch": 0.5915201654601862, "grad_norm": 0.3688738949047048, "learning_rate": 4.4593558282208594e-05, "loss": 0.6222, "step": 286 }, { "epoch": 0.59358841778697, "grad_norm": 0.5880791376742642, "learning_rate": 4.455521472392638e-05, "loss": 0.5795, "step": 287 }, { "epoch": 0.5956566701137539, "grad_norm": 0.3876548870513828, "learning_rate": 4.451687116564417e-05, "loss": 0.7123, "step": 288 }, { "epoch": 0.5977249224405378, "grad_norm": 0.40168166297765207, "learning_rate": 4.4478527607361964e-05, "loss": 0.7792, "step": 289 }, { "epoch": 0.5997931747673216, "grad_norm": 0.42910287969464556, "learning_rate": 4.444018404907976e-05, "loss": 0.6425, "step": 290 }, { "epoch": 0.6018614270941055, "grad_norm": 2.699402176424409, "learning_rate": 4.440184049079755e-05, "loss": 0.7461, "step": 291 }, { "epoch": 0.6039296794208894, "grad_norm": 0.4231091636268457, "learning_rate": 4.436349693251534e-05, "loss": 0.6693, "step": 292 }, { "epoch": 0.6059979317476732, "grad_norm": 0.4033926687671075, "learning_rate": 4.432515337423313e-05, "loss": 0.8021, "step": 293 }, { "epoch": 0.6080661840744571, "grad_norm": 0.40228183218910374, "learning_rate": 4.4286809815950926e-05, "loss": 0.559, "step": 294 }, { "epoch": 0.610134436401241, "grad_norm": 0.3912842354998142, "learning_rate": 4.424846625766871e-05, "loss": 0.7229, "step": 295 }, { "epoch": 0.6122026887280249, "grad_norm": 0.42332817954179025, "learning_rate": 4.4210122699386504e-05, "loss": 0.7244, "step": 296 }, { "epoch": 0.6142709410548087, "grad_norm": 0.43014857039372695, "learning_rate": 4.41717791411043e-05, "loss": 0.7013, "step": 297 }, { "epoch": 0.6163391933815926, "grad_norm": 0.4420329275626106, "learning_rate": 4.413343558282208e-05, "loss": 0.7254, "step": 298 }, { "epoch": 0.6184074457083765, "grad_norm": 0.39820624245244557, "learning_rate": 4.409509202453988e-05, "loss": 0.7599, "step": 299 }, { "epoch": 0.6204756980351603, "grad_norm": 0.43151519998719967, "learning_rate": 4.405674846625767e-05, "loss": 0.7607, "step": 300 }, { "epoch": 0.6225439503619442, "grad_norm": 0.39544613047234783, "learning_rate": 4.4018404907975466e-05, "loss": 0.5897, "step": 301 }, { "epoch": 0.6246122026887281, "grad_norm": 0.3762152067468455, "learning_rate": 4.398006134969325e-05, "loss": 0.6777, "step": 302 }, { "epoch": 0.6266804550155118, "grad_norm": 0.6444797821255654, "learning_rate": 4.3941717791411044e-05, "loss": 0.7662, "step": 303 }, { "epoch": 0.6287487073422957, "grad_norm": 0.39168044829961624, "learning_rate": 4.390337423312884e-05, "loss": 0.6475, "step": 304 }, { "epoch": 0.6308169596690796, "grad_norm": 0.38590514445664986, "learning_rate": 4.386503067484663e-05, "loss": 0.6432, "step": 305 }, { "epoch": 0.6328852119958635, "grad_norm": 0.359675822497109, "learning_rate": 4.382668711656442e-05, "loss": 0.6778, "step": 306 }, { "epoch": 0.6349534643226473, "grad_norm": 0.386749636368913, "learning_rate": 4.3788343558282214e-05, "loss": 0.6623, "step": 307 }, { "epoch": 0.6370217166494312, "grad_norm": 0.3899926660352858, "learning_rate": 4.375e-05, "loss": 0.7259, "step": 308 }, { "epoch": 0.6390899689762151, "grad_norm": 0.3664504082553066, "learning_rate": 4.371165644171779e-05, "loss": 0.6053, "step": 309 }, { "epoch": 0.6411582213029989, "grad_norm": 0.42960860050384375, "learning_rate": 4.3673312883435584e-05, "loss": 0.68, "step": 310 }, { "epoch": 0.6432264736297828, "grad_norm": 0.37799035143944265, "learning_rate": 4.363496932515338e-05, "loss": 0.6578, "step": 311 }, { "epoch": 0.6452947259565667, "grad_norm": 0.39139122356076633, "learning_rate": 4.359662576687117e-05, "loss": 0.702, "step": 312 }, { "epoch": 0.6473629782833505, "grad_norm": 0.3832460883016103, "learning_rate": 4.3558282208588955e-05, "loss": 0.6644, "step": 313 }, { "epoch": 0.6494312306101344, "grad_norm": 1.4719489016101033, "learning_rate": 4.3519938650306754e-05, "loss": 0.6755, "step": 314 }, { "epoch": 0.6514994829369183, "grad_norm": 0.4342331238723843, "learning_rate": 4.348159509202454e-05, "loss": 0.7983, "step": 315 }, { "epoch": 0.6535677352637022, "grad_norm": 0.37680415266345046, "learning_rate": 4.344325153374233e-05, "loss": 0.7081, "step": 316 }, { "epoch": 0.655635987590486, "grad_norm": 1.5812084660839276, "learning_rate": 4.3404907975460124e-05, "loss": 0.7062, "step": 317 }, { "epoch": 0.6577042399172699, "grad_norm": 0.5912116086867399, "learning_rate": 4.336656441717792e-05, "loss": 0.623, "step": 318 }, { "epoch": 0.6597724922440538, "grad_norm": 0.4327177721938775, "learning_rate": 4.332822085889571e-05, "loss": 0.6602, "step": 319 }, { "epoch": 0.6618407445708376, "grad_norm": 0.5591161299231283, "learning_rate": 4.32898773006135e-05, "loss": 0.7607, "step": 320 }, { "epoch": 0.6639089968976215, "grad_norm": 0.36922008351179386, "learning_rate": 4.3251533742331294e-05, "loss": 0.5734, "step": 321 }, { "epoch": 0.6659772492244054, "grad_norm": 0.5749294191530909, "learning_rate": 4.321319018404908e-05, "loss": 0.777, "step": 322 }, { "epoch": 0.6680455015511892, "grad_norm": 0.4060150425474482, "learning_rate": 4.317484662576687e-05, "loss": 0.5635, "step": 323 }, { "epoch": 0.6701137538779731, "grad_norm": 0.4785042017735117, "learning_rate": 4.3136503067484664e-05, "loss": 0.601, "step": 324 }, { "epoch": 0.672182006204757, "grad_norm": 0.4777653950312122, "learning_rate": 4.309815950920246e-05, "loss": 0.7437, "step": 325 }, { "epoch": 0.6742502585315409, "grad_norm": 0.5326763106562807, "learning_rate": 4.305981595092025e-05, "loss": 0.7303, "step": 326 }, { "epoch": 0.6763185108583247, "grad_norm": 0.39002521960079595, "learning_rate": 4.302147239263804e-05, "loss": 0.6992, "step": 327 }, { "epoch": 0.6783867631851086, "grad_norm": 0.48589789849205034, "learning_rate": 4.298312883435583e-05, "loss": 0.6537, "step": 328 }, { "epoch": 0.6804550155118925, "grad_norm": 0.36784847365416046, "learning_rate": 4.2944785276073626e-05, "loss": 0.6014, "step": 329 }, { "epoch": 0.6825232678386763, "grad_norm": 0.4497963486880099, "learning_rate": 4.290644171779141e-05, "loss": 0.6873, "step": 330 }, { "epoch": 0.6845915201654602, "grad_norm": 0.40504474402131, "learning_rate": 4.2868098159509204e-05, "loss": 0.6941, "step": 331 }, { "epoch": 0.6866597724922441, "grad_norm": 0.4699099865065199, "learning_rate": 4.2829754601227e-05, "loss": 0.6931, "step": 332 }, { "epoch": 0.688728024819028, "grad_norm": 0.4846675248848742, "learning_rate": 4.279141104294479e-05, "loss": 0.6461, "step": 333 }, { "epoch": 0.6907962771458118, "grad_norm": 0.3397807274856155, "learning_rate": 4.275306748466258e-05, "loss": 0.676, "step": 334 }, { "epoch": 0.6928645294725957, "grad_norm": 0.424948944002537, "learning_rate": 4.271472392638037e-05, "loss": 0.6692, "step": 335 }, { "epoch": 0.6949327817993796, "grad_norm": 0.38247915991760206, "learning_rate": 4.267638036809816e-05, "loss": 0.7584, "step": 336 }, { "epoch": 0.6970010341261634, "grad_norm": 0.3741220131914702, "learning_rate": 4.263803680981595e-05, "loss": 0.6571, "step": 337 }, { "epoch": 0.6990692864529473, "grad_norm": 0.3689438546767675, "learning_rate": 4.2599693251533744e-05, "loss": 0.6133, "step": 338 }, { "epoch": 0.7011375387797312, "grad_norm": 0.4459760663567278, "learning_rate": 4.2561349693251537e-05, "loss": 0.6731, "step": 339 }, { "epoch": 0.703205791106515, "grad_norm": 0.32641258503659676, "learning_rate": 4.252300613496933e-05, "loss": 0.6906, "step": 340 }, { "epoch": 0.7052740434332989, "grad_norm": 0.44598692946112106, "learning_rate": 4.2484662576687115e-05, "loss": 0.6903, "step": 341 }, { "epoch": 0.7073422957600828, "grad_norm": 0.34007163968527787, "learning_rate": 4.2446319018404914e-05, "loss": 0.7159, "step": 342 }, { "epoch": 0.7094105480868665, "grad_norm": 0.3664062956344628, "learning_rate": 4.24079754601227e-05, "loss": 0.6467, "step": 343 }, { "epoch": 0.7114788004136504, "grad_norm": 0.3146931484242662, "learning_rate": 4.23696319018405e-05, "loss": 0.6519, "step": 344 }, { "epoch": 0.7135470527404343, "grad_norm": 0.40380290791449297, "learning_rate": 4.2331288343558284e-05, "loss": 0.623, "step": 345 }, { "epoch": 0.7156153050672182, "grad_norm": 0.3393677527963375, "learning_rate": 4.229294478527607e-05, "loss": 0.6539, "step": 346 }, { "epoch": 0.717683557394002, "grad_norm": 0.471439467081511, "learning_rate": 4.225460122699387e-05, "loss": 0.6869, "step": 347 }, { "epoch": 0.7197518097207859, "grad_norm": 1.7628078472001265, "learning_rate": 4.2216257668711655e-05, "loss": 0.6843, "step": 348 }, { "epoch": 0.7218200620475698, "grad_norm": 0.5187125676087443, "learning_rate": 4.2177914110429454e-05, "loss": 0.714, "step": 349 }, { "epoch": 0.7238883143743536, "grad_norm": 1.6070857083504344, "learning_rate": 4.213957055214724e-05, "loss": 0.6895, "step": 350 }, { "epoch": 0.7259565667011375, "grad_norm": 0.5955946314443722, "learning_rate": 4.210122699386503e-05, "loss": 0.7838, "step": 351 }, { "epoch": 0.7280248190279214, "grad_norm": 0.3522066846979316, "learning_rate": 4.2062883435582824e-05, "loss": 0.6122, "step": 352 }, { "epoch": 0.7300930713547052, "grad_norm": 0.5616391559487439, "learning_rate": 4.2024539877300617e-05, "loss": 0.7827, "step": 353 }, { "epoch": 0.7321613236814891, "grad_norm": 0.4623474227842011, "learning_rate": 4.198619631901841e-05, "loss": 0.7149, "step": 354 }, { "epoch": 0.734229576008273, "grad_norm": 0.4259678487473483, "learning_rate": 4.19478527607362e-05, "loss": 0.6731, "step": 355 }, { "epoch": 0.7362978283350569, "grad_norm": 0.4063586222181074, "learning_rate": 4.190950920245399e-05, "loss": 0.6743, "step": 356 }, { "epoch": 0.7383660806618407, "grad_norm": 0.4232457947925708, "learning_rate": 4.1871165644171786e-05, "loss": 0.6723, "step": 357 }, { "epoch": 0.7404343329886246, "grad_norm": 0.38274648805166733, "learning_rate": 4.183282208588957e-05, "loss": 0.758, "step": 358 }, { "epoch": 0.7425025853154085, "grad_norm": 0.38915042191860544, "learning_rate": 4.1794478527607364e-05, "loss": 0.6636, "step": 359 }, { "epoch": 0.7445708376421923, "grad_norm": 1.6525109403677467, "learning_rate": 4.1756134969325156e-05, "loss": 0.6811, "step": 360 }, { "epoch": 0.7466390899689762, "grad_norm": 0.47639660759430436, "learning_rate": 4.171779141104294e-05, "loss": 0.7061, "step": 361 }, { "epoch": 0.7487073422957601, "grad_norm": 0.5131983562933468, "learning_rate": 4.167944785276074e-05, "loss": 0.709, "step": 362 }, { "epoch": 0.750775594622544, "grad_norm": 0.5727399522979899, "learning_rate": 4.164110429447853e-05, "loss": 0.7465, "step": 363 }, { "epoch": 0.7528438469493278, "grad_norm": 0.5766606747388124, "learning_rate": 4.160276073619632e-05, "loss": 0.6591, "step": 364 }, { "epoch": 0.7549120992761117, "grad_norm": 0.5147284148485484, "learning_rate": 4.156441717791411e-05, "loss": 0.6705, "step": 365 }, { "epoch": 0.7569803516028956, "grad_norm": 0.5557008795425701, "learning_rate": 4.1526073619631904e-05, "loss": 0.7177, "step": 366 }, { "epoch": 0.7590486039296794, "grad_norm": 0.7090427462962371, "learning_rate": 4.1487730061349696e-05, "loss": 0.7071, "step": 367 }, { "epoch": 0.7611168562564633, "grad_norm": 1.2678188956969185, "learning_rate": 4.144938650306749e-05, "loss": 0.6648, "step": 368 }, { "epoch": 0.7631851085832472, "grad_norm": 0.42738769663310694, "learning_rate": 4.1411042944785274e-05, "loss": 0.5864, "step": 369 }, { "epoch": 0.765253360910031, "grad_norm": 0.40234825440444505, "learning_rate": 4.1372699386503074e-05, "loss": 0.5871, "step": 370 }, { "epoch": 0.7673216132368149, "grad_norm": 0.8324108927120342, "learning_rate": 4.133435582822086e-05, "loss": 0.6347, "step": 371 }, { "epoch": 0.7693898655635988, "grad_norm": 0.5470878407578921, "learning_rate": 4.129601226993865e-05, "loss": 0.6931, "step": 372 }, { "epoch": 0.7714581178903827, "grad_norm": 0.4210071120248633, "learning_rate": 4.1257668711656444e-05, "loss": 0.6399, "step": 373 }, { "epoch": 0.7735263702171665, "grad_norm": 0.502435409102665, "learning_rate": 4.1219325153374236e-05, "loss": 0.7141, "step": 374 }, { "epoch": 0.7755946225439504, "grad_norm": 0.4005691462312516, "learning_rate": 4.118098159509203e-05, "loss": 0.6741, "step": 375 }, { "epoch": 0.7776628748707343, "grad_norm": 0.5744511410887219, "learning_rate": 4.1142638036809814e-05, "loss": 0.6747, "step": 376 }, { "epoch": 0.7797311271975181, "grad_norm": 1.8280751337666792, "learning_rate": 4.1104294478527614e-05, "loss": 0.6161, "step": 377 }, { "epoch": 0.781799379524302, "grad_norm": 0.37815370021228467, "learning_rate": 4.10659509202454e-05, "loss": 0.651, "step": 378 }, { "epoch": 0.7838676318510859, "grad_norm": 0.34132602948143903, "learning_rate": 4.102760736196319e-05, "loss": 0.6509, "step": 379 }, { "epoch": 0.7859358841778697, "grad_norm": 0.3758789123400006, "learning_rate": 4.0989263803680984e-05, "loss": 0.5681, "step": 380 }, { "epoch": 0.7880041365046536, "grad_norm": 0.3617150874290157, "learning_rate": 4.0950920245398776e-05, "loss": 0.6991, "step": 381 }, { "epoch": 0.7900723888314375, "grad_norm": 0.4000637407298211, "learning_rate": 4.091257668711657e-05, "loss": 0.5686, "step": 382 }, { "epoch": 0.7921406411582212, "grad_norm": 0.4053123170710516, "learning_rate": 4.087423312883436e-05, "loss": 0.5868, "step": 383 }, { "epoch": 0.7942088934850051, "grad_norm": 0.38919350784129875, "learning_rate": 4.083588957055215e-05, "loss": 0.6521, "step": 384 }, { "epoch": 0.796277145811789, "grad_norm": 0.42037426200169703, "learning_rate": 4.079754601226994e-05, "loss": 0.599, "step": 385 }, { "epoch": 0.7983453981385729, "grad_norm": 0.4374696374749754, "learning_rate": 4.075920245398773e-05, "loss": 0.7421, "step": 386 }, { "epoch": 0.8004136504653567, "grad_norm": 0.4337482694011715, "learning_rate": 4.0720858895705524e-05, "loss": 0.6701, "step": 387 }, { "epoch": 0.8024819027921406, "grad_norm": 0.31522582656016873, "learning_rate": 4.0682515337423316e-05, "loss": 0.6309, "step": 388 }, { "epoch": 0.8045501551189245, "grad_norm": 0.36937210006748133, "learning_rate": 4.06441717791411e-05, "loss": 0.5424, "step": 389 }, { "epoch": 0.8066184074457083, "grad_norm": 0.45670114772856263, "learning_rate": 4.06058282208589e-05, "loss": 0.7204, "step": 390 }, { "epoch": 0.8086866597724922, "grad_norm": 0.41280765241184775, "learning_rate": 4.056748466257669e-05, "loss": 0.6562, "step": 391 }, { "epoch": 0.8107549120992761, "grad_norm": 0.4148725366101085, "learning_rate": 4.052914110429448e-05, "loss": 0.7744, "step": 392 }, { "epoch": 0.81282316442606, "grad_norm": 0.478350545332851, "learning_rate": 4.049079754601227e-05, "loss": 0.7063, "step": 393 }, { "epoch": 0.8148914167528438, "grad_norm": 0.34886807154178306, "learning_rate": 4.0452453987730064e-05, "loss": 0.6401, "step": 394 }, { "epoch": 0.8169596690796277, "grad_norm": 0.42268438956469756, "learning_rate": 4.0414110429447856e-05, "loss": 0.7071, "step": 395 }, { "epoch": 0.8190279214064116, "grad_norm": 0.38833778011440834, "learning_rate": 4.037576687116564e-05, "loss": 0.5622, "step": 396 }, { "epoch": 0.8210961737331954, "grad_norm": 0.36326634913740996, "learning_rate": 4.033742331288344e-05, "loss": 0.6831, "step": 397 }, { "epoch": 0.8231644260599793, "grad_norm": 0.3733770555013717, "learning_rate": 4.029907975460123e-05, "loss": 0.552, "step": 398 }, { "epoch": 0.8252326783867632, "grad_norm": 0.4224210982053915, "learning_rate": 4.026073619631902e-05, "loss": 0.7099, "step": 399 }, { "epoch": 0.827300930713547, "grad_norm": 0.36950452299026443, "learning_rate": 4.022239263803681e-05, "loss": 0.6293, "step": 400 }, { "epoch": 0.8293691830403309, "grad_norm": 18.619455471041586, "learning_rate": 4.0184049079754604e-05, "loss": 0.7364, "step": 401 }, { "epoch": 0.8314374353671148, "grad_norm": 0.6945857391982944, "learning_rate": 4.0145705521472396e-05, "loss": 0.6712, "step": 402 }, { "epoch": 0.8335056876938987, "grad_norm": 0.4026270657239264, "learning_rate": 4.010736196319019e-05, "loss": 0.6076, "step": 403 }, { "epoch": 0.8355739400206825, "grad_norm": 0.5793185434470874, "learning_rate": 4.0069018404907974e-05, "loss": 0.6666, "step": 404 }, { "epoch": 0.8376421923474664, "grad_norm": 0.5261144045489929, "learning_rate": 4.0030674846625773e-05, "loss": 0.6995, "step": 405 }, { "epoch": 0.8397104446742503, "grad_norm": 0.39950224673410417, "learning_rate": 3.999233128834356e-05, "loss": 0.5701, "step": 406 }, { "epoch": 0.8417786970010341, "grad_norm": 0.4507781230970944, "learning_rate": 3.995398773006135e-05, "loss": 0.5851, "step": 407 }, { "epoch": 0.843846949327818, "grad_norm": 0.4591832369985359, "learning_rate": 3.9915644171779144e-05, "loss": 0.6447, "step": 408 }, { "epoch": 0.8459152016546019, "grad_norm": 0.41654751151992325, "learning_rate": 3.987730061349693e-05, "loss": 0.73, "step": 409 }, { "epoch": 0.8479834539813857, "grad_norm": 0.5552235897532064, "learning_rate": 3.983895705521473e-05, "loss": 0.6607, "step": 410 }, { "epoch": 0.8500517063081696, "grad_norm": 0.4306205078517733, "learning_rate": 3.9800613496932514e-05, "loss": 0.6838, "step": 411 }, { "epoch": 0.8521199586349535, "grad_norm": 0.47942265616396645, "learning_rate": 3.9762269938650307e-05, "loss": 0.6655, "step": 412 }, { "epoch": 0.8541882109617374, "grad_norm": 0.4247148555456135, "learning_rate": 3.97239263803681e-05, "loss": 0.7819, "step": 413 }, { "epoch": 0.8562564632885212, "grad_norm": 0.49273160621376816, "learning_rate": 3.968558282208589e-05, "loss": 0.7044, "step": 414 }, { "epoch": 0.8583247156153051, "grad_norm": 0.4812789289664582, "learning_rate": 3.9647239263803684e-05, "loss": 0.7042, "step": 415 }, { "epoch": 0.860392967942089, "grad_norm": 0.47009337827636916, "learning_rate": 3.9608895705521476e-05, "loss": 0.6864, "step": 416 }, { "epoch": 0.8624612202688728, "grad_norm": 0.4890513974387533, "learning_rate": 3.957055214723926e-05, "loss": 0.8008, "step": 417 }, { "epoch": 0.8645294725956567, "grad_norm": 0.37765923517892747, "learning_rate": 3.953220858895706e-05, "loss": 0.7028, "step": 418 }, { "epoch": 0.8665977249224406, "grad_norm": 0.334866265650456, "learning_rate": 3.9493865030674847e-05, "loss": 0.5616, "step": 419 }, { "epoch": 0.8686659772492245, "grad_norm": 0.32848868295773237, "learning_rate": 3.9455521472392646e-05, "loss": 0.6317, "step": 420 }, { "epoch": 0.8707342295760083, "grad_norm": 0.39807786469176193, "learning_rate": 3.941717791411043e-05, "loss": 0.6414, "step": 421 }, { "epoch": 0.8728024819027922, "grad_norm": 0.33662521258553346, "learning_rate": 3.937883435582822e-05, "loss": 0.6554, "step": 422 }, { "epoch": 0.8748707342295761, "grad_norm": 0.336074533140799, "learning_rate": 3.9340490797546016e-05, "loss": 0.7284, "step": 423 }, { "epoch": 0.8769389865563598, "grad_norm": 0.4899778688182057, "learning_rate": 3.93021472392638e-05, "loss": 0.6326, "step": 424 }, { "epoch": 0.8790072388831437, "grad_norm": 0.4060844755096083, "learning_rate": 3.92638036809816e-05, "loss": 0.8007, "step": 425 }, { "epoch": 0.8810754912099276, "grad_norm": 0.32850915181950563, "learning_rate": 3.9225460122699387e-05, "loss": 0.6001, "step": 426 }, { "epoch": 0.8831437435367114, "grad_norm": 0.37241327194341356, "learning_rate": 3.918711656441718e-05, "loss": 0.5862, "step": 427 }, { "epoch": 0.8852119958634953, "grad_norm": 0.3874208537076751, "learning_rate": 3.914877300613497e-05, "loss": 0.6459, "step": 428 }, { "epoch": 0.8872802481902792, "grad_norm": 0.38389865480520374, "learning_rate": 3.9110429447852764e-05, "loss": 0.6428, "step": 429 }, { "epoch": 0.889348500517063, "grad_norm": 0.4123246523133085, "learning_rate": 3.9072085889570556e-05, "loss": 0.5678, "step": 430 }, { "epoch": 0.8914167528438469, "grad_norm": 0.43149230450759885, "learning_rate": 3.903374233128835e-05, "loss": 0.7038, "step": 431 }, { "epoch": 0.8934850051706308, "grad_norm": 0.4816374704626401, "learning_rate": 3.8995398773006134e-05, "loss": 0.6906, "step": 432 }, { "epoch": 0.8955532574974147, "grad_norm": 0.3337531787422873, "learning_rate": 3.895705521472393e-05, "loss": 0.6668, "step": 433 }, { "epoch": 0.8976215098241985, "grad_norm": 0.3473051007642662, "learning_rate": 3.891871165644172e-05, "loss": 0.6392, "step": 434 }, { "epoch": 0.8996897621509824, "grad_norm": 0.40890472215186563, "learning_rate": 3.888036809815951e-05, "loss": 0.6618, "step": 435 }, { "epoch": 0.9017580144777663, "grad_norm": 0.36122401035375995, "learning_rate": 3.8842024539877304e-05, "loss": 0.6567, "step": 436 }, { "epoch": 0.9038262668045501, "grad_norm": 0.3663680702410421, "learning_rate": 3.880368098159509e-05, "loss": 0.6815, "step": 437 }, { "epoch": 0.905894519131334, "grad_norm": 0.3335874218488767, "learning_rate": 3.876533742331289e-05, "loss": 0.578, "step": 438 }, { "epoch": 0.9079627714581179, "grad_norm": 0.3602108059260039, "learning_rate": 3.8726993865030674e-05, "loss": 0.607, "step": 439 }, { "epoch": 0.9100310237849017, "grad_norm": 0.3426351686304335, "learning_rate": 3.8688650306748466e-05, "loss": 0.7768, "step": 440 }, { "epoch": 0.9120992761116856, "grad_norm": 0.3708021868520424, "learning_rate": 3.865030674846626e-05, "loss": 0.6488, "step": 441 }, { "epoch": 0.9141675284384695, "grad_norm": 0.4151840382185296, "learning_rate": 3.861196319018405e-05, "loss": 0.676, "step": 442 }, { "epoch": 0.9162357807652534, "grad_norm": 0.4432297352397585, "learning_rate": 3.8573619631901844e-05, "loss": 0.7127, "step": 443 }, { "epoch": 0.9183040330920372, "grad_norm": 0.37444698296684137, "learning_rate": 3.8535276073619636e-05, "loss": 0.6276, "step": 444 }, { "epoch": 0.9203722854188211, "grad_norm": 0.35593538407835296, "learning_rate": 3.849693251533742e-05, "loss": 0.5868, "step": 445 }, { "epoch": 0.922440537745605, "grad_norm": 0.31600306069058587, "learning_rate": 3.8458588957055214e-05, "loss": 0.6271, "step": 446 }, { "epoch": 0.9245087900723888, "grad_norm": 0.36754594616124736, "learning_rate": 3.8420245398773006e-05, "loss": 0.634, "step": 447 }, { "epoch": 0.9265770423991727, "grad_norm": 0.33000856296277775, "learning_rate": 3.83819018404908e-05, "loss": 0.686, "step": 448 }, { "epoch": 0.9286452947259566, "grad_norm": 0.37104200406662446, "learning_rate": 3.834355828220859e-05, "loss": 0.7284, "step": 449 }, { "epoch": 0.9307135470527405, "grad_norm": 0.3301466515975829, "learning_rate": 3.8305214723926384e-05, "loss": 0.5512, "step": 450 }, { "epoch": 0.9327817993795243, "grad_norm": 0.3627959607826997, "learning_rate": 3.8266871165644176e-05, "loss": 0.6269, "step": 451 }, { "epoch": 0.9348500517063082, "grad_norm": 0.33739237936787936, "learning_rate": 3.822852760736196e-05, "loss": 0.6402, "step": 452 }, { "epoch": 0.9369183040330921, "grad_norm": 0.33156823315512196, "learning_rate": 3.819018404907976e-05, "loss": 0.6404, "step": 453 }, { "epoch": 0.9389865563598759, "grad_norm": 0.37416544292459025, "learning_rate": 3.8151840490797546e-05, "loss": 0.6524, "step": 454 }, { "epoch": 0.9410548086866598, "grad_norm": 0.3955715434235705, "learning_rate": 3.811349693251534e-05, "loss": 0.7204, "step": 455 }, { "epoch": 0.9431230610134437, "grad_norm": 0.35515536832326566, "learning_rate": 3.807515337423313e-05, "loss": 0.6354, "step": 456 }, { "epoch": 0.9451913133402275, "grad_norm": 0.3944112018274409, "learning_rate": 3.8036809815950924e-05, "loss": 0.736, "step": 457 }, { "epoch": 0.9472595656670114, "grad_norm": 0.4074779783459248, "learning_rate": 3.7998466257668716e-05, "loss": 0.7517, "step": 458 }, { "epoch": 0.9493278179937953, "grad_norm": 0.32456958410782955, "learning_rate": 3.79601226993865e-05, "loss": 0.657, "step": 459 }, { "epoch": 0.9513960703205792, "grad_norm": 0.39891144328208855, "learning_rate": 3.7921779141104294e-05, "loss": 0.6779, "step": 460 }, { "epoch": 0.953464322647363, "grad_norm": 0.39671734493299715, "learning_rate": 3.7883435582822086e-05, "loss": 0.6222, "step": 461 }, { "epoch": 0.9555325749741469, "grad_norm": 0.3744174141215214, "learning_rate": 3.784509202453988e-05, "loss": 0.5845, "step": 462 }, { "epoch": 0.9576008273009308, "grad_norm": 0.337766073253915, "learning_rate": 3.780674846625767e-05, "loss": 0.6298, "step": 463 }, { "epoch": 0.9596690796277145, "grad_norm": 0.37927995932822306, "learning_rate": 3.7768404907975464e-05, "loss": 0.6278, "step": 464 }, { "epoch": 0.9617373319544984, "grad_norm": 0.348997383525954, "learning_rate": 3.773006134969325e-05, "loss": 0.6576, "step": 465 }, { "epoch": 0.9638055842812823, "grad_norm": 0.39022611417665753, "learning_rate": 3.769171779141105e-05, "loss": 0.6742, "step": 466 }, { "epoch": 0.9658738366080661, "grad_norm": 0.33652200106412206, "learning_rate": 3.7653374233128834e-05, "loss": 0.5891, "step": 467 }, { "epoch": 0.96794208893485, "grad_norm": 0.38780981263108044, "learning_rate": 3.7615030674846626e-05, "loss": 0.7018, "step": 468 }, { "epoch": 0.9700103412616339, "grad_norm": 0.35788957548138595, "learning_rate": 3.757668711656442e-05, "loss": 0.6537, "step": 469 }, { "epoch": 0.9720785935884177, "grad_norm": 0.38597060406212097, "learning_rate": 3.753834355828221e-05, "loss": 0.6617, "step": 470 }, { "epoch": 0.9741468459152016, "grad_norm": 0.40187559007990686, "learning_rate": 3.7500000000000003e-05, "loss": 0.6186, "step": 471 }, { "epoch": 0.9762150982419855, "grad_norm": 0.38717174542217797, "learning_rate": 3.746165644171779e-05, "loss": 0.575, "step": 472 }, { "epoch": 0.9782833505687694, "grad_norm": 0.39455012289497304, "learning_rate": 3.742331288343559e-05, "loss": 0.6724, "step": 473 }, { "epoch": 0.9803516028955532, "grad_norm": 0.35686811847644856, "learning_rate": 3.7384969325153374e-05, "loss": 0.6756, "step": 474 }, { "epoch": 0.9824198552223371, "grad_norm": 0.3709858456107756, "learning_rate": 3.7346625766871166e-05, "loss": 0.6974, "step": 475 }, { "epoch": 0.984488107549121, "grad_norm": 0.34622251149211264, "learning_rate": 3.730828220858896e-05, "loss": 0.6423, "step": 476 }, { "epoch": 0.9865563598759048, "grad_norm": 0.4411009531373164, "learning_rate": 3.726993865030675e-05, "loss": 0.7037, "step": 477 }, { "epoch": 0.9886246122026887, "grad_norm": 0.3474961135599018, "learning_rate": 3.7231595092024543e-05, "loss": 0.6285, "step": 478 }, { "epoch": 0.9906928645294726, "grad_norm": 0.40687523123074604, "learning_rate": 3.7193251533742336e-05, "loss": 0.6936, "step": 479 }, { "epoch": 0.9927611168562565, "grad_norm": 0.36919788663049213, "learning_rate": 3.715490797546012e-05, "loss": 0.5929, "step": 480 }, { "epoch": 0.9948293691830403, "grad_norm": 0.3928621021189457, "learning_rate": 3.711656441717792e-05, "loss": 0.5499, "step": 481 }, { "epoch": 0.9968976215098242, "grad_norm": 0.3696028326694325, "learning_rate": 3.7078220858895706e-05, "loss": 0.6807, "step": 482 }, { "epoch": 0.9989658738366081, "grad_norm": 0.3715508833604711, "learning_rate": 3.70398773006135e-05, "loss": 0.5937, "step": 483 }, { "epoch": 1.0, "grad_norm": 0.3715508833604711, "learning_rate": 3.700153374233129e-05, "loss": 0.6648, "step": 484 }, { "epoch": 1.0020682523267839, "grad_norm": 0.5759518846073001, "learning_rate": 3.696319018404908e-05, "loss": 0.5782, "step": 485 }, { "epoch": 1.0041365046535677, "grad_norm": 0.384554982925459, "learning_rate": 3.6924846625766876e-05, "loss": 0.5261, "step": 486 }, { "epoch": 1.0062047569803516, "grad_norm": 0.36295620866593986, "learning_rate": 3.688650306748466e-05, "loss": 0.5615, "step": 487 }, { "epoch": 1.0082730093071355, "grad_norm": 0.38690380714740535, "learning_rate": 3.6848159509202454e-05, "loss": 0.5413, "step": 488 }, { "epoch": 1.0103412616339194, "grad_norm": 0.3579588697863454, "learning_rate": 3.6809815950920246e-05, "loss": 0.5578, "step": 489 }, { "epoch": 1.0124095139607032, "grad_norm": 0.5883565226964408, "learning_rate": 3.677147239263804e-05, "loss": 0.5064, "step": 490 }, { "epoch": 1.014477766287487, "grad_norm": 0.394009675567436, "learning_rate": 3.673312883435583e-05, "loss": 0.4903, "step": 491 }, { "epoch": 1.016546018614271, "grad_norm": 0.36876773845450544, "learning_rate": 3.669478527607362e-05, "loss": 0.5061, "step": 492 }, { "epoch": 1.0186142709410548, "grad_norm": 0.35167213047039275, "learning_rate": 3.665644171779141e-05, "loss": 0.4988, "step": 493 }, { "epoch": 1.0206825232678387, "grad_norm": 0.38322081243029493, "learning_rate": 3.661809815950921e-05, "loss": 0.5097, "step": 494 }, { "epoch": 1.0227507755946226, "grad_norm": 0.32571062822682945, "learning_rate": 3.6579754601226994e-05, "loss": 0.5, "step": 495 }, { "epoch": 1.0248190279214064, "grad_norm": 0.3839042911375249, "learning_rate": 3.654141104294479e-05, "loss": 0.5898, "step": 496 }, { "epoch": 1.0268872802481903, "grad_norm": 0.6208252799607458, "learning_rate": 3.650306748466258e-05, "loss": 0.5828, "step": 497 }, { "epoch": 1.0289555325749742, "grad_norm": 0.33099412875372897, "learning_rate": 3.6464723926380364e-05, "loss": 0.5173, "step": 498 }, { "epoch": 1.031023784901758, "grad_norm": 0.35682454226697546, "learning_rate": 3.642638036809816e-05, "loss": 0.5593, "step": 499 }, { "epoch": 1.033092037228542, "grad_norm": 0.34246877906252615, "learning_rate": 3.638803680981595e-05, "loss": 0.4821, "step": 500 }, { "epoch": 1.0351602895553258, "grad_norm": 0.2719130860773164, "learning_rate": 3.634969325153375e-05, "loss": 0.5473, "step": 501 }, { "epoch": 1.0372285418821097, "grad_norm": 0.3842749485639536, "learning_rate": 3.6311349693251534e-05, "loss": 0.5447, "step": 502 }, { "epoch": 1.0392967942088935, "grad_norm": 0.30206813638417385, "learning_rate": 3.6273006134969326e-05, "loss": 0.5937, "step": 503 }, { "epoch": 1.0413650465356774, "grad_norm": 0.36372272217337803, "learning_rate": 3.623466257668712e-05, "loss": 0.4784, "step": 504 }, { "epoch": 1.0434332988624613, "grad_norm": 0.32913417526000177, "learning_rate": 3.619631901840491e-05, "loss": 0.5332, "step": 505 }, { "epoch": 1.0455015511892451, "grad_norm": 0.3843673323872735, "learning_rate": 3.61579754601227e-05, "loss": 0.5449, "step": 506 }, { "epoch": 1.047569803516029, "grad_norm": 0.3518633552772409, "learning_rate": 3.6119631901840496e-05, "loss": 0.4814, "step": 507 }, { "epoch": 1.049638055842813, "grad_norm": 0.3683850983782102, "learning_rate": 3.608128834355828e-05, "loss": 0.5509, "step": 508 }, { "epoch": 1.0517063081695968, "grad_norm": 0.3467886350104457, "learning_rate": 3.6042944785276074e-05, "loss": 0.5507, "step": 509 }, { "epoch": 1.0537745604963806, "grad_norm": 0.30062810346846147, "learning_rate": 3.6004601226993866e-05, "loss": 0.4987, "step": 510 }, { "epoch": 1.0558428128231645, "grad_norm": 0.3301401667693514, "learning_rate": 3.596625766871166e-05, "loss": 0.5037, "step": 511 }, { "epoch": 1.0579110651499484, "grad_norm": 0.33188902478807464, "learning_rate": 3.592791411042945e-05, "loss": 0.5239, "step": 512 }, { "epoch": 1.0599793174767322, "grad_norm": 0.31433978137081864, "learning_rate": 3.5889570552147236e-05, "loss": 0.4904, "step": 513 }, { "epoch": 1.0620475698035161, "grad_norm": 0.3507959498463901, "learning_rate": 3.5851226993865036e-05, "loss": 0.5366, "step": 514 }, { "epoch": 1.0641158221303, "grad_norm": 0.46924859061692564, "learning_rate": 3.581288343558282e-05, "loss": 0.5713, "step": 515 }, { "epoch": 1.0661840744570839, "grad_norm": 0.33395185472589617, "learning_rate": 3.5774539877300614e-05, "loss": 0.5507, "step": 516 }, { "epoch": 1.0682523267838677, "grad_norm": 0.29989405679692865, "learning_rate": 3.5736196319018406e-05, "loss": 0.5612, "step": 517 }, { "epoch": 1.0703205791106516, "grad_norm": 0.3571201023953414, "learning_rate": 3.56978527607362e-05, "loss": 0.6449, "step": 518 }, { "epoch": 1.0723888314374355, "grad_norm": 0.338364840512957, "learning_rate": 3.565950920245399e-05, "loss": 0.5994, "step": 519 }, { "epoch": 1.0744570837642193, "grad_norm": 0.3757863422955221, "learning_rate": 3.562116564417178e-05, "loss": 0.5517, "step": 520 }, { "epoch": 1.0765253360910032, "grad_norm": 0.33117087593605254, "learning_rate": 3.558282208588957e-05, "loss": 0.5309, "step": 521 }, { "epoch": 1.078593588417787, "grad_norm": 0.3475244242246385, "learning_rate": 3.554447852760736e-05, "loss": 0.5021, "step": 522 }, { "epoch": 1.080661840744571, "grad_norm": 5.397113090473603, "learning_rate": 3.5506134969325154e-05, "loss": 0.5494, "step": 523 }, { "epoch": 1.0827300930713548, "grad_norm": 0.3599550847771871, "learning_rate": 3.5467791411042946e-05, "loss": 0.5577, "step": 524 }, { "epoch": 1.0847983453981387, "grad_norm": 0.32322271528248875, "learning_rate": 3.542944785276074e-05, "loss": 0.4872, "step": 525 }, { "epoch": 1.0868665977249226, "grad_norm": 0.3102650920482785, "learning_rate": 3.539110429447853e-05, "loss": 0.5564, "step": 526 }, { "epoch": 1.0889348500517062, "grad_norm": 0.3010727891535726, "learning_rate": 3.535276073619632e-05, "loss": 0.5406, "step": 527 }, { "epoch": 1.09100310237849, "grad_norm": 0.31211740971915763, "learning_rate": 3.531441717791411e-05, "loss": 0.5507, "step": 528 }, { "epoch": 1.093071354705274, "grad_norm": 0.3203054529646532, "learning_rate": 3.527607361963191e-05, "loss": 0.6251, "step": 529 }, { "epoch": 1.0951396070320578, "grad_norm": 0.4296782982399442, "learning_rate": 3.5237730061349694e-05, "loss": 0.539, "step": 530 }, { "epoch": 1.0972078593588417, "grad_norm": 0.28469427042620954, "learning_rate": 3.5199386503067486e-05, "loss": 0.4783, "step": 531 }, { "epoch": 1.0992761116856256, "grad_norm": 0.3740011637674365, "learning_rate": 3.516104294478528e-05, "loss": 0.5677, "step": 532 }, { "epoch": 1.1013443640124094, "grad_norm": 0.35893277054103645, "learning_rate": 3.512269938650307e-05, "loss": 0.6146, "step": 533 }, { "epoch": 1.1034126163391933, "grad_norm": 0.3063161125003297, "learning_rate": 3.508435582822086e-05, "loss": 0.6386, "step": 534 }, { "epoch": 1.1054808686659772, "grad_norm": 0.3613499624019987, "learning_rate": 3.504601226993865e-05, "loss": 0.5327, "step": 535 }, { "epoch": 1.107549120992761, "grad_norm": 0.3092040009776186, "learning_rate": 3.500766871165644e-05, "loss": 0.5627, "step": 536 }, { "epoch": 1.109617373319545, "grad_norm": 0.3726534229610569, "learning_rate": 3.4969325153374234e-05, "loss": 0.5316, "step": 537 }, { "epoch": 1.1116856256463288, "grad_norm": 0.28273126091963224, "learning_rate": 3.4930981595092026e-05, "loss": 0.5633, "step": 538 }, { "epoch": 1.1137538779731126, "grad_norm": 0.35014685528643774, "learning_rate": 3.489263803680982e-05, "loss": 0.5251, "step": 539 }, { "epoch": 1.1158221302998965, "grad_norm": 0.32549714238967625, "learning_rate": 3.485429447852761e-05, "loss": 0.5982, "step": 540 }, { "epoch": 1.1178903826266804, "grad_norm": 0.3136500847280185, "learning_rate": 3.4815950920245396e-05, "loss": 0.5241, "step": 541 }, { "epoch": 1.1199586349534643, "grad_norm": 0.28817079175373544, "learning_rate": 3.4777607361963196e-05, "loss": 0.65, "step": 542 }, { "epoch": 1.1220268872802481, "grad_norm": 0.3547514691515379, "learning_rate": 3.473926380368098e-05, "loss": 0.5155, "step": 543 }, { "epoch": 1.124095139607032, "grad_norm": 0.29010449235842034, "learning_rate": 3.470092024539878e-05, "loss": 0.5487, "step": 544 }, { "epoch": 1.1261633919338159, "grad_norm": 0.3197474843863331, "learning_rate": 3.4662576687116566e-05, "loss": 0.5489, "step": 545 }, { "epoch": 1.1282316442605997, "grad_norm": 0.32818095041934137, "learning_rate": 3.462423312883436e-05, "loss": 0.4879, "step": 546 }, { "epoch": 1.1302998965873836, "grad_norm": 0.3425259915408221, "learning_rate": 3.458588957055215e-05, "loss": 0.5466, "step": 547 }, { "epoch": 1.1323681489141675, "grad_norm": 0.30535100044488966, "learning_rate": 3.4547546012269936e-05, "loss": 0.5671, "step": 548 }, { "epoch": 1.1344364012409514, "grad_norm": 0.39347452469995464, "learning_rate": 3.4509202453987735e-05, "loss": 0.5474, "step": 549 }, { "epoch": 1.1365046535677352, "grad_norm": 0.3057380142013982, "learning_rate": 3.447085889570552e-05, "loss": 0.4905, "step": 550 }, { "epoch": 1.138572905894519, "grad_norm": 0.3466035211677145, "learning_rate": 3.4432515337423313e-05, "loss": 0.4716, "step": 551 }, { "epoch": 1.140641158221303, "grad_norm": 0.3181606126412205, "learning_rate": 3.4394171779141106e-05, "loss": 0.5681, "step": 552 }, { "epoch": 1.1427094105480868, "grad_norm": 0.2868650020991315, "learning_rate": 3.43558282208589e-05, "loss": 0.5238, "step": 553 }, { "epoch": 1.1447776628748707, "grad_norm": 0.30249791127193276, "learning_rate": 3.431748466257669e-05, "loss": 0.5385, "step": 554 }, { "epoch": 1.1468459152016546, "grad_norm": 0.326593603903082, "learning_rate": 3.427914110429448e-05, "loss": 0.5151, "step": 555 }, { "epoch": 1.1489141675284384, "grad_norm": 0.31749656874686055, "learning_rate": 3.424079754601227e-05, "loss": 0.4808, "step": 556 }, { "epoch": 1.1509824198552223, "grad_norm": 0.29104445437637766, "learning_rate": 3.420245398773007e-05, "loss": 0.4846, "step": 557 }, { "epoch": 1.1530506721820062, "grad_norm": 0.35079468112351775, "learning_rate": 3.4164110429447853e-05, "loss": 0.4672, "step": 558 }, { "epoch": 1.15511892450879, "grad_norm": 0.37389543448483903, "learning_rate": 3.4125766871165646e-05, "loss": 0.487, "step": 559 }, { "epoch": 1.157187176835574, "grad_norm": 0.3862212335066802, "learning_rate": 3.408742331288344e-05, "loss": 0.6734, "step": 560 }, { "epoch": 1.1592554291623578, "grad_norm": 0.32121781317814607, "learning_rate": 3.4049079754601224e-05, "loss": 0.6159, "step": 561 }, { "epoch": 1.1613236814891417, "grad_norm": 0.7977587174741722, "learning_rate": 3.401073619631902e-05, "loss": 0.607, "step": 562 }, { "epoch": 1.1633919338159255, "grad_norm": 0.342445666641679, "learning_rate": 3.397239263803681e-05, "loss": 0.5283, "step": 563 }, { "epoch": 1.1654601861427094, "grad_norm": 0.33073937176113055, "learning_rate": 3.39340490797546e-05, "loss": 0.5429, "step": 564 }, { "epoch": 1.1675284384694933, "grad_norm": 0.40606645956951337, "learning_rate": 3.3895705521472393e-05, "loss": 0.5987, "step": 565 }, { "epoch": 1.1695966907962771, "grad_norm": 0.38677341924924585, "learning_rate": 3.3857361963190186e-05, "loss": 0.5155, "step": 566 }, { "epoch": 1.171664943123061, "grad_norm": 0.38308822469015213, "learning_rate": 3.381901840490798e-05, "loss": 0.5094, "step": 567 }, { "epoch": 1.173733195449845, "grad_norm": 0.3010897248635725, "learning_rate": 3.378067484662577e-05, "loss": 0.4963, "step": 568 }, { "epoch": 1.1758014477766288, "grad_norm": 0.344998311918829, "learning_rate": 3.3742331288343556e-05, "loss": 0.4393, "step": 569 }, { "epoch": 1.1778697001034126, "grad_norm": 0.3266974930414999, "learning_rate": 3.3703987730061355e-05, "loss": 0.6224, "step": 570 }, { "epoch": 1.1799379524301965, "grad_norm": 0.36302392510924925, "learning_rate": 3.366564417177914e-05, "loss": 0.5029, "step": 571 }, { "epoch": 1.1820062047569804, "grad_norm": 0.3548945875565279, "learning_rate": 3.362730061349693e-05, "loss": 0.5483, "step": 572 }, { "epoch": 1.1840744570837642, "grad_norm": 0.3666450790269034, "learning_rate": 3.3588957055214726e-05, "loss": 0.581, "step": 573 }, { "epoch": 1.1861427094105481, "grad_norm": 0.3692870960426435, "learning_rate": 3.355061349693251e-05, "loss": 0.5549, "step": 574 }, { "epoch": 1.188210961737332, "grad_norm": 0.36835965249017244, "learning_rate": 3.351226993865031e-05, "loss": 0.5762, "step": 575 }, { "epoch": 1.1902792140641159, "grad_norm": 0.3114729725566869, "learning_rate": 3.3473926380368096e-05, "loss": 0.5311, "step": 576 }, { "epoch": 1.1923474663908997, "grad_norm": 0.3347192221584002, "learning_rate": 3.3435582822085895e-05, "loss": 0.5575, "step": 577 }, { "epoch": 1.1944157187176836, "grad_norm": 0.43115382341957503, "learning_rate": 3.339723926380368e-05, "loss": 0.6021, "step": 578 }, { "epoch": 1.1964839710444675, "grad_norm": 0.30558846355947544, "learning_rate": 3.335889570552147e-05, "loss": 0.5012, "step": 579 }, { "epoch": 1.1985522233712513, "grad_norm": 0.3392194748221668, "learning_rate": 3.3320552147239266e-05, "loss": 0.532, "step": 580 }, { "epoch": 1.2006204756980352, "grad_norm": 0.2652361011172681, "learning_rate": 3.328220858895706e-05, "loss": 0.5296, "step": 581 }, { "epoch": 1.202688728024819, "grad_norm": 0.2783826410030279, "learning_rate": 3.324386503067485e-05, "loss": 0.5561, "step": 582 }, { "epoch": 1.204756980351603, "grad_norm": 0.3679598855985759, "learning_rate": 3.320552147239264e-05, "loss": 0.6222, "step": 583 }, { "epoch": 1.2068252326783868, "grad_norm": 0.26919703675623213, "learning_rate": 3.316717791411043e-05, "loss": 0.4813, "step": 584 }, { "epoch": 1.2088934850051707, "grad_norm": 0.3538816429524022, "learning_rate": 3.312883435582822e-05, "loss": 0.4964, "step": 585 }, { "epoch": 1.2109617373319546, "grad_norm": 0.3058342592305421, "learning_rate": 3.309049079754601e-05, "loss": 0.596, "step": 586 }, { "epoch": 1.2130299896587384, "grad_norm": 0.3948117714085258, "learning_rate": 3.3052147239263806e-05, "loss": 0.5559, "step": 587 }, { "epoch": 1.2150982419855223, "grad_norm": 0.3045447828576869, "learning_rate": 3.30138036809816e-05, "loss": 0.4863, "step": 588 }, { "epoch": 1.2171664943123062, "grad_norm": 0.3665043918635398, "learning_rate": 3.2975460122699384e-05, "loss": 0.5604, "step": 589 }, { "epoch": 1.21923474663909, "grad_norm": 0.33194603953910423, "learning_rate": 3.293711656441718e-05, "loss": 0.5125, "step": 590 }, { "epoch": 1.221302998965874, "grad_norm": 0.27562106319717444, "learning_rate": 3.289877300613497e-05, "loss": 0.5819, "step": 591 }, { "epoch": 1.2233712512926578, "grad_norm": 0.35156119039953665, "learning_rate": 3.286042944785276e-05, "loss": 0.4674, "step": 592 }, { "epoch": 1.2254395036194417, "grad_norm": 0.3267606422117779, "learning_rate": 3.282208588957055e-05, "loss": 0.4776, "step": 593 }, { "epoch": 1.2275077559462255, "grad_norm": 0.308616893053233, "learning_rate": 3.2783742331288346e-05, "loss": 0.5303, "step": 594 }, { "epoch": 1.2295760082730094, "grad_norm": 0.2909256878777864, "learning_rate": 3.274539877300614e-05, "loss": 0.5288, "step": 595 }, { "epoch": 1.2316442605997933, "grad_norm": 0.2867374842257631, "learning_rate": 3.270705521472393e-05, "loss": 0.5354, "step": 596 }, { "epoch": 1.2337125129265771, "grad_norm": 0.2812317626231946, "learning_rate": 3.266871165644172e-05, "loss": 0.5307, "step": 597 }, { "epoch": 1.235780765253361, "grad_norm": 0.25001336136643654, "learning_rate": 3.263036809815951e-05, "loss": 0.498, "step": 598 }, { "epoch": 1.2378490175801449, "grad_norm": 0.30381494459654823, "learning_rate": 3.25920245398773e-05, "loss": 0.5114, "step": 599 }, { "epoch": 1.2399172699069285, "grad_norm": 0.32025044678119163, "learning_rate": 3.255368098159509e-05, "loss": 0.4666, "step": 600 }, { "epoch": 1.2419855222337124, "grad_norm": 0.2627928627697657, "learning_rate": 3.2515337423312886e-05, "loss": 0.5217, "step": 601 }, { "epoch": 1.2440537745604963, "grad_norm": 0.30221269533938805, "learning_rate": 3.247699386503068e-05, "loss": 0.6172, "step": 602 }, { "epoch": 1.2461220268872801, "grad_norm": 0.32545652127805735, "learning_rate": 3.243865030674847e-05, "loss": 0.5561, "step": 603 }, { "epoch": 1.248190279214064, "grad_norm": 0.27258126452709397, "learning_rate": 3.2400306748466256e-05, "loss": 0.4569, "step": 604 }, { "epoch": 1.2502585315408479, "grad_norm": 0.2821639532716121, "learning_rate": 3.2361963190184055e-05, "loss": 0.5444, "step": 605 }, { "epoch": 1.2523267838676317, "grad_norm": 0.3160565708564914, "learning_rate": 3.232361963190184e-05, "loss": 0.515, "step": 606 }, { "epoch": 1.2543950361944156, "grad_norm": 0.30426994206100183, "learning_rate": 3.228527607361963e-05, "loss": 0.5573, "step": 607 }, { "epoch": 1.2564632885211995, "grad_norm": 0.3213784405898027, "learning_rate": 3.2246932515337426e-05, "loss": 0.5329, "step": 608 }, { "epoch": 1.2585315408479834, "grad_norm": 0.32134219158167987, "learning_rate": 3.220858895705521e-05, "loss": 0.5288, "step": 609 }, { "epoch": 1.2605997931747672, "grad_norm": 0.30180094438957794, "learning_rate": 3.217024539877301e-05, "loss": 0.531, "step": 610 }, { "epoch": 1.262668045501551, "grad_norm": 0.27373051046077607, "learning_rate": 3.2131901840490796e-05, "loss": 0.4642, "step": 611 }, { "epoch": 1.264736297828335, "grad_norm": 0.35020839652093305, "learning_rate": 3.209355828220859e-05, "loss": 0.5595, "step": 612 }, { "epoch": 1.2668045501551188, "grad_norm": 0.286933709521671, "learning_rate": 3.205521472392638e-05, "loss": 0.5211, "step": 613 }, { "epoch": 1.2688728024819027, "grad_norm": 0.3211816151230246, "learning_rate": 3.201687116564417e-05, "loss": 0.5545, "step": 614 }, { "epoch": 1.2709410548086866, "grad_norm": 0.3264875871317652, "learning_rate": 3.1978527607361966e-05, "loss": 0.6526, "step": 615 }, { "epoch": 1.2730093071354704, "grad_norm": 0.2976680064818853, "learning_rate": 3.194018404907976e-05, "loss": 0.52, "step": 616 }, { "epoch": 1.2750775594622543, "grad_norm": 0.3309677246734631, "learning_rate": 3.1901840490797544e-05, "loss": 0.5344, "step": 617 }, { "epoch": 1.2771458117890382, "grad_norm": 0.30319039280932397, "learning_rate": 3.186349693251534e-05, "loss": 0.4768, "step": 618 }, { "epoch": 1.279214064115822, "grad_norm": 0.31604945953628505, "learning_rate": 3.182515337423313e-05, "loss": 0.5295, "step": 619 }, { "epoch": 1.281282316442606, "grad_norm": 0.349126596989517, "learning_rate": 3.178680981595093e-05, "loss": 0.6212, "step": 620 }, { "epoch": 1.2833505687693898, "grad_norm": 0.3685075046759841, "learning_rate": 3.174846625766871e-05, "loss": 0.5316, "step": 621 }, { "epoch": 1.2854188210961737, "grad_norm": 0.3703497674344484, "learning_rate": 3.17101226993865e-05, "loss": 0.5909, "step": 622 }, { "epoch": 1.2874870734229575, "grad_norm": 0.41204808720814656, "learning_rate": 3.16717791411043e-05, "loss": 0.5727, "step": 623 }, { "epoch": 1.2895553257497414, "grad_norm": 0.349084146532956, "learning_rate": 3.1633435582822084e-05, "loss": 0.619, "step": 624 }, { "epoch": 1.2916235780765253, "grad_norm": 0.36465378077731725, "learning_rate": 3.159509202453988e-05, "loss": 0.5259, "step": 625 }, { "epoch": 1.2936918304033092, "grad_norm": 0.3181906100508291, "learning_rate": 3.155674846625767e-05, "loss": 0.5116, "step": 626 }, { "epoch": 1.295760082730093, "grad_norm": 0.3359379449383167, "learning_rate": 3.151840490797546e-05, "loss": 0.5753, "step": 627 }, { "epoch": 1.297828335056877, "grad_norm": 0.3333264824671679, "learning_rate": 3.148006134969325e-05, "loss": 0.5652, "step": 628 }, { "epoch": 1.2998965873836608, "grad_norm": 0.31257775297759877, "learning_rate": 3.1441717791411045e-05, "loss": 0.5033, "step": 629 }, { "epoch": 1.3019648397104446, "grad_norm": 0.4189964949339624, "learning_rate": 3.140337423312884e-05, "loss": 0.556, "step": 630 }, { "epoch": 1.3040330920372285, "grad_norm": 0.3192462313714596, "learning_rate": 3.136503067484663e-05, "loss": 0.5027, "step": 631 }, { "epoch": 1.3061013443640124, "grad_norm": 0.3784054893804669, "learning_rate": 3.1326687116564416e-05, "loss": 0.5369, "step": 632 }, { "epoch": 1.3081695966907962, "grad_norm": 0.29196359050342335, "learning_rate": 3.1288343558282215e-05, "loss": 0.493, "step": 633 }, { "epoch": 1.3102378490175801, "grad_norm": 0.3353609044174032, "learning_rate": 3.125e-05, "loss": 0.6121, "step": 634 }, { "epoch": 1.312306101344364, "grad_norm": 0.30767008591435646, "learning_rate": 3.121165644171779e-05, "loss": 0.5159, "step": 635 }, { "epoch": 1.3143743536711479, "grad_norm": 0.3400642900971622, "learning_rate": 3.1173312883435585e-05, "loss": 0.5668, "step": 636 }, { "epoch": 1.3164426059979317, "grad_norm": 0.3684217834076075, "learning_rate": 3.113496932515337e-05, "loss": 0.5991, "step": 637 }, { "epoch": 1.3185108583247156, "grad_norm": 0.3141619680261082, "learning_rate": 3.109662576687117e-05, "loss": 0.5328, "step": 638 }, { "epoch": 1.3205791106514995, "grad_norm": 0.37290735367502115, "learning_rate": 3.1058282208588956e-05, "loss": 0.623, "step": 639 }, { "epoch": 1.3226473629782833, "grad_norm": 0.905383472904807, "learning_rate": 3.101993865030675e-05, "loss": 0.6098, "step": 640 }, { "epoch": 1.3247156153050672, "grad_norm": 0.3060214403354029, "learning_rate": 3.098159509202454e-05, "loss": 0.5312, "step": 641 }, { "epoch": 1.326783867631851, "grad_norm": 0.43418079944850513, "learning_rate": 3.094325153374233e-05, "loss": 0.5836, "step": 642 }, { "epoch": 1.328852119958635, "grad_norm": 0.29290270574963667, "learning_rate": 3.0904907975460125e-05, "loss": 0.5074, "step": 643 }, { "epoch": 1.3309203722854188, "grad_norm": 0.3816412943516462, "learning_rate": 3.086656441717792e-05, "loss": 0.4681, "step": 644 }, { "epoch": 1.3329886246122027, "grad_norm": 0.2877230194752186, "learning_rate": 3.0828220858895703e-05, "loss": 0.51, "step": 645 }, { "epoch": 1.3350568769389866, "grad_norm": 0.3312008222936841, "learning_rate": 3.07898773006135e-05, "loss": 0.5806, "step": 646 }, { "epoch": 1.3371251292657704, "grad_norm": 0.3116418600687731, "learning_rate": 3.075153374233129e-05, "loss": 0.4675, "step": 647 }, { "epoch": 1.3391933815925543, "grad_norm": 0.38304971758088546, "learning_rate": 3.071319018404908e-05, "loss": 0.5399, "step": 648 }, { "epoch": 1.3412616339193382, "grad_norm": 0.33002715193393095, "learning_rate": 3.067484662576687e-05, "loss": 0.5029, "step": 649 }, { "epoch": 1.343329886246122, "grad_norm": 0.3669350177078939, "learning_rate": 3.0636503067484665e-05, "loss": 0.503, "step": 650 }, { "epoch": 1.345398138572906, "grad_norm": 0.33180000244766883, "learning_rate": 3.059815950920246e-05, "loss": 0.4942, "step": 651 }, { "epoch": 1.3474663908996898, "grad_norm": 0.36013480863945335, "learning_rate": 3.055981595092024e-05, "loss": 0.5254, "step": 652 }, { "epoch": 1.3495346432264737, "grad_norm": 0.3846643372382617, "learning_rate": 3.052147239263804e-05, "loss": 0.426, "step": 653 }, { "epoch": 1.3516028955532575, "grad_norm": 0.39528354069076904, "learning_rate": 3.0483128834355828e-05, "loss": 0.5718, "step": 654 }, { "epoch": 1.3536711478800414, "grad_norm": 0.28260619465705905, "learning_rate": 3.0444785276073624e-05, "loss": 0.5385, "step": 655 }, { "epoch": 1.3557394002068253, "grad_norm": 0.33002990860281595, "learning_rate": 3.0406441717791413e-05, "loss": 0.5361, "step": 656 }, { "epoch": 1.3578076525336091, "grad_norm": 0.33825395224894583, "learning_rate": 3.0368098159509205e-05, "loss": 0.5181, "step": 657 }, { "epoch": 1.359875904860393, "grad_norm": 0.33420435814611954, "learning_rate": 3.0329754601226994e-05, "loss": 0.5299, "step": 658 }, { "epoch": 1.3619441571871769, "grad_norm": 0.6587366420247192, "learning_rate": 3.029141104294479e-05, "loss": 0.5534, "step": 659 }, { "epoch": 1.3640124095139607, "grad_norm": 0.3650185380983616, "learning_rate": 3.025306748466258e-05, "loss": 0.4762, "step": 660 }, { "epoch": 1.3660806618407446, "grad_norm": 0.302795483254532, "learning_rate": 3.0214723926380368e-05, "loss": 0.5069, "step": 661 }, { "epoch": 1.3681489141675285, "grad_norm": 0.32851375307988917, "learning_rate": 3.017638036809816e-05, "loss": 0.5034, "step": 662 }, { "epoch": 1.3702171664943124, "grad_norm": 0.3075153937212787, "learning_rate": 3.013803680981595e-05, "loss": 0.5553, "step": 663 }, { "epoch": 1.3722854188210962, "grad_norm": 0.3462788914504136, "learning_rate": 3.0099693251533745e-05, "loss": 0.5377, "step": 664 }, { "epoch": 1.37435367114788, "grad_norm": 0.3652241207103308, "learning_rate": 3.0061349693251534e-05, "loss": 0.4638, "step": 665 }, { "epoch": 1.376421923474664, "grad_norm": 6.111245818530581, "learning_rate": 3.0023006134969327e-05, "loss": 0.5314, "step": 666 }, { "epoch": 1.3784901758014478, "grad_norm": 0.4657860201639562, "learning_rate": 2.9984662576687116e-05, "loss": 0.5152, "step": 667 }, { "epoch": 1.3805584281282317, "grad_norm": 0.31180202259966644, "learning_rate": 2.994631901840491e-05, "loss": 0.5668, "step": 668 }, { "epoch": 1.3826266804550156, "grad_norm": 0.4253476183926863, "learning_rate": 2.99079754601227e-05, "loss": 0.5161, "step": 669 }, { "epoch": 1.3846949327817994, "grad_norm": 0.3545809579980766, "learning_rate": 2.9869631901840496e-05, "loss": 0.5423, "step": 670 }, { "epoch": 1.3867631851085833, "grad_norm": 0.32256856708477105, "learning_rate": 2.9831288343558282e-05, "loss": 0.5482, "step": 671 }, { "epoch": 1.3888314374353672, "grad_norm": 0.4042733257749836, "learning_rate": 2.979294478527607e-05, "loss": 0.5746, "step": 672 }, { "epoch": 1.390899689762151, "grad_norm": 0.29481294633147664, "learning_rate": 2.9754601226993867e-05, "loss": 0.5104, "step": 673 }, { "epoch": 1.392967942088935, "grad_norm": 0.35191059266448516, "learning_rate": 2.9716257668711656e-05, "loss": 0.5092, "step": 674 }, { "epoch": 1.3950361944157188, "grad_norm": 0.33419893196378775, "learning_rate": 2.967791411042945e-05, "loss": 0.466, "step": 675 }, { "epoch": 1.3971044467425027, "grad_norm": 0.34368387582978877, "learning_rate": 2.963957055214724e-05, "loss": 0.5321, "step": 676 }, { "epoch": 1.3991726990692865, "grad_norm": 0.2847388972066702, "learning_rate": 2.9601226993865033e-05, "loss": 0.5155, "step": 677 }, { "epoch": 1.4012409513960704, "grad_norm": 0.2738850993353864, "learning_rate": 2.9562883435582822e-05, "loss": 0.5592, "step": 678 }, { "epoch": 1.4033092037228543, "grad_norm": 0.333667853957691, "learning_rate": 2.9524539877300618e-05, "loss": 0.5023, "step": 679 }, { "epoch": 1.4053774560496382, "grad_norm": 0.27097334941090634, "learning_rate": 2.9486196319018407e-05, "loss": 0.4765, "step": 680 }, { "epoch": 1.407445708376422, "grad_norm": 0.34572736114157787, "learning_rate": 2.94478527607362e-05, "loss": 0.5102, "step": 681 }, { "epoch": 1.409513960703206, "grad_norm": 0.31273920022828977, "learning_rate": 2.9409509202453988e-05, "loss": 0.4809, "step": 682 }, { "epoch": 1.4115822130299898, "grad_norm": 0.33734711390430555, "learning_rate": 2.9371165644171784e-05, "loss": 0.4939, "step": 683 }, { "epoch": 1.4136504653567736, "grad_norm": 0.312588179653306, "learning_rate": 2.9332822085889573e-05, "loss": 0.5494, "step": 684 }, { "epoch": 1.4157187176835575, "grad_norm": 0.31931799811912126, "learning_rate": 2.9294478527607362e-05, "loss": 0.5497, "step": 685 }, { "epoch": 1.4177869700103414, "grad_norm": 0.34386628298286376, "learning_rate": 2.9256134969325154e-05, "loss": 0.4984, "step": 686 }, { "epoch": 1.4198552223371252, "grad_norm": 0.30727262376765235, "learning_rate": 2.9217791411042943e-05, "loss": 0.5785, "step": 687 }, { "epoch": 1.4219234746639091, "grad_norm": 0.3597175107852496, "learning_rate": 2.917944785276074e-05, "loss": 0.586, "step": 688 }, { "epoch": 1.423991726990693, "grad_norm": 0.282825912685819, "learning_rate": 2.9141104294478528e-05, "loss": 0.5716, "step": 689 }, { "epoch": 1.4260599793174769, "grad_norm": 0.31860020271502354, "learning_rate": 2.910276073619632e-05, "loss": 0.5331, "step": 690 }, { "epoch": 1.4281282316442607, "grad_norm": 0.29973727898789176, "learning_rate": 2.906441717791411e-05, "loss": 0.5361, "step": 691 }, { "epoch": 1.4301964839710446, "grad_norm": 0.6339768815180811, "learning_rate": 2.9026073619631905e-05, "loss": 0.5379, "step": 692 }, { "epoch": 1.4322647362978285, "grad_norm": 0.3206089691700537, "learning_rate": 2.8987730061349694e-05, "loss": 0.5019, "step": 693 }, { "epoch": 1.4343329886246123, "grad_norm": 0.30801928805771817, "learning_rate": 2.894938650306749e-05, "loss": 0.4557, "step": 694 }, { "epoch": 1.4364012409513962, "grad_norm": 0.32238071842643135, "learning_rate": 2.8911042944785276e-05, "loss": 0.5343, "step": 695 }, { "epoch": 1.4384694932781799, "grad_norm": 1.8147470435924464, "learning_rate": 2.887269938650307e-05, "loss": 0.5843, "step": 696 }, { "epoch": 1.4405377456049637, "grad_norm": 0.35292708512880056, "learning_rate": 2.883435582822086e-05, "loss": 0.4899, "step": 697 }, { "epoch": 1.4426059979317476, "grad_norm": 0.3570535807282174, "learning_rate": 2.879601226993865e-05, "loss": 0.574, "step": 698 }, { "epoch": 1.4446742502585315, "grad_norm": 0.3047497710043228, "learning_rate": 2.8757668711656445e-05, "loss": 0.5166, "step": 699 }, { "epoch": 1.4467425025853153, "grad_norm": 0.3397256257558058, "learning_rate": 2.8719325153374234e-05, "loss": 0.4916, "step": 700 }, { "epoch": 1.4488107549120992, "grad_norm": 0.2848274598560013, "learning_rate": 2.8680981595092026e-05, "loss": 0.5191, "step": 701 }, { "epoch": 1.450879007238883, "grad_norm": 0.3131372957053315, "learning_rate": 2.8642638036809815e-05, "loss": 0.5293, "step": 702 }, { "epoch": 1.452947259565667, "grad_norm": 0.29834374061176183, "learning_rate": 2.860429447852761e-05, "loss": 0.5465, "step": 703 }, { "epoch": 1.4550155118924508, "grad_norm": 0.30998116285940824, "learning_rate": 2.85659509202454e-05, "loss": 0.5695, "step": 704 }, { "epoch": 1.4570837642192347, "grad_norm": 0.2790935012830894, "learning_rate": 2.8527607361963193e-05, "loss": 0.5041, "step": 705 }, { "epoch": 1.4591520165460186, "grad_norm": 0.33363396746309315, "learning_rate": 2.848926380368098e-05, "loss": 0.5838, "step": 706 }, { "epoch": 1.4612202688728024, "grad_norm": 0.27446902656866184, "learning_rate": 2.8450920245398777e-05, "loss": 0.5492, "step": 707 }, { "epoch": 1.4632885211995863, "grad_norm": 0.30021549746748216, "learning_rate": 2.8412576687116566e-05, "loss": 0.5388, "step": 708 }, { "epoch": 1.4653567735263702, "grad_norm": 0.2681607600295317, "learning_rate": 2.837423312883436e-05, "loss": 0.4899, "step": 709 }, { "epoch": 1.467425025853154, "grad_norm": 1.3079355537676651, "learning_rate": 2.8335889570552148e-05, "loss": 0.4956, "step": 710 }, { "epoch": 1.469493278179938, "grad_norm": 0.29981071996663833, "learning_rate": 2.8297546012269937e-05, "loss": 0.5136, "step": 711 }, { "epoch": 1.4715615305067218, "grad_norm": 0.34378464819876753, "learning_rate": 2.8259202453987733e-05, "loss": 0.506, "step": 712 }, { "epoch": 1.4736297828335057, "grad_norm": 0.37119203523401945, "learning_rate": 2.822085889570552e-05, "loss": 0.5879, "step": 713 }, { "epoch": 1.4756980351602895, "grad_norm": 0.36290990352182406, "learning_rate": 2.8182515337423314e-05, "loss": 0.5387, "step": 714 }, { "epoch": 1.4777662874870734, "grad_norm": 0.37967141571275215, "learning_rate": 2.8144171779141103e-05, "loss": 0.4459, "step": 715 }, { "epoch": 1.4798345398138573, "grad_norm": 0.3572296224279994, "learning_rate": 2.81058282208589e-05, "loss": 0.5088, "step": 716 }, { "epoch": 1.4819027921406411, "grad_norm": 0.3883791212859971, "learning_rate": 2.8067484662576688e-05, "loss": 0.5658, "step": 717 }, { "epoch": 1.483971044467425, "grad_norm": 0.350818013768724, "learning_rate": 2.802914110429448e-05, "loss": 0.5597, "step": 718 }, { "epoch": 1.4860392967942089, "grad_norm": 0.3356317945321125, "learning_rate": 2.799079754601227e-05, "loss": 0.5268, "step": 719 }, { "epoch": 1.4881075491209927, "grad_norm": 0.3482789693030063, "learning_rate": 2.7952453987730065e-05, "loss": 0.468, "step": 720 }, { "epoch": 1.4901758014477766, "grad_norm": 0.3160233697407584, "learning_rate": 2.7914110429447854e-05, "loss": 0.505, "step": 721 }, { "epoch": 1.4922440537745605, "grad_norm": 0.2871568016551331, "learning_rate": 2.7875766871165643e-05, "loss": 0.5451, "step": 722 }, { "epoch": 1.4943123061013444, "grad_norm": 0.34729175041939836, "learning_rate": 2.783742331288344e-05, "loss": 0.5885, "step": 723 }, { "epoch": 1.4963805584281282, "grad_norm": 0.32442941669066966, "learning_rate": 2.7799079754601224e-05, "loss": 0.4625, "step": 724 }, { "epoch": 1.498448810754912, "grad_norm": 0.31315207728106337, "learning_rate": 2.776073619631902e-05, "loss": 0.4824, "step": 725 }, { "epoch": 1.500517063081696, "grad_norm": 0.3392318869950744, "learning_rate": 2.772239263803681e-05, "loss": 0.521, "step": 726 }, { "epoch": 1.5025853154084798, "grad_norm": 0.31523048278409177, "learning_rate": 2.7684049079754605e-05, "loss": 0.5936, "step": 727 }, { "epoch": 1.5046535677352637, "grad_norm": 0.39259962340033905, "learning_rate": 2.7645705521472394e-05, "loss": 0.5852, "step": 728 }, { "epoch": 1.5067218200620476, "grad_norm": 0.3528588011574089, "learning_rate": 2.7607361963190186e-05, "loss": 0.5483, "step": 729 }, { "epoch": 1.5087900723888314, "grad_norm": 0.3156890244637162, "learning_rate": 2.7569018404907975e-05, "loss": 0.4687, "step": 730 }, { "epoch": 1.5108583247156153, "grad_norm": 0.3178641245142643, "learning_rate": 2.753067484662577e-05, "loss": 0.4638, "step": 731 }, { "epoch": 1.5129265770423992, "grad_norm": 0.39429931861661366, "learning_rate": 2.749233128834356e-05, "loss": 0.5193, "step": 732 }, { "epoch": 1.514994829369183, "grad_norm": 0.32604679255307367, "learning_rate": 2.7453987730061353e-05, "loss": 0.5181, "step": 733 }, { "epoch": 1.517063081695967, "grad_norm": 0.3524324760658334, "learning_rate": 2.741564417177914e-05, "loss": 0.4731, "step": 734 }, { "epoch": 1.5191313340227508, "grad_norm": 0.33046751258500423, "learning_rate": 2.737730061349693e-05, "loss": 0.4751, "step": 735 }, { "epoch": 1.5211995863495347, "grad_norm": 0.39163491514881604, "learning_rate": 2.7338957055214726e-05, "loss": 0.5316, "step": 736 }, { "epoch": 1.5232678386763185, "grad_norm": 0.5266273898583848, "learning_rate": 2.7300613496932515e-05, "loss": 0.5999, "step": 737 }, { "epoch": 1.5253360910031024, "grad_norm": 0.41680779777728166, "learning_rate": 2.7262269938650308e-05, "loss": 0.5192, "step": 738 }, { "epoch": 1.5274043433298863, "grad_norm": 0.28946370653605336, "learning_rate": 2.7223926380368097e-05, "loss": 0.5555, "step": 739 }, { "epoch": 1.5294725956566702, "grad_norm": 0.3416406282764261, "learning_rate": 2.7185582822085892e-05, "loss": 0.4718, "step": 740 }, { "epoch": 1.531540847983454, "grad_norm": 0.3874213770100785, "learning_rate": 2.714723926380368e-05, "loss": 0.519, "step": 741 }, { "epoch": 1.533609100310238, "grad_norm": 0.33286373120701945, "learning_rate": 2.7108895705521474e-05, "loss": 0.5198, "step": 742 }, { "epoch": 1.5356773526370218, "grad_norm": 0.3325814891618725, "learning_rate": 2.7070552147239263e-05, "loss": 0.5112, "step": 743 }, { "epoch": 1.5377456049638056, "grad_norm": 0.4100481255152612, "learning_rate": 2.703220858895706e-05, "loss": 0.6715, "step": 744 }, { "epoch": 1.5398138572905895, "grad_norm": 0.31093676117355434, "learning_rate": 2.6993865030674848e-05, "loss": 0.5772, "step": 745 }, { "epoch": 1.5418821096173732, "grad_norm": 0.30842925836641494, "learning_rate": 2.6955521472392643e-05, "loss": 0.5596, "step": 746 }, { "epoch": 1.543950361944157, "grad_norm": 0.7977149395783424, "learning_rate": 2.6917177914110432e-05, "loss": 0.5182, "step": 747 }, { "epoch": 1.546018614270941, "grad_norm": 0.3205340018892863, "learning_rate": 2.6878834355828218e-05, "loss": 0.7287, "step": 748 }, { "epoch": 1.5480868665977248, "grad_norm": 2.2874588995506344, "learning_rate": 2.6840490797546014e-05, "loss": 0.5552, "step": 749 }, { "epoch": 1.5501551189245086, "grad_norm": 0.3180691125031402, "learning_rate": 2.6802147239263803e-05, "loss": 0.5715, "step": 750 }, { "epoch": 1.5522233712512925, "grad_norm": 0.4191403812883971, "learning_rate": 2.67638036809816e-05, "loss": 0.6083, "step": 751 }, { "epoch": 1.5542916235780764, "grad_norm": 0.31029131226590795, "learning_rate": 2.6725460122699388e-05, "loss": 0.5398, "step": 752 }, { "epoch": 1.5563598759048602, "grad_norm": 0.3367756367584682, "learning_rate": 2.668711656441718e-05, "loss": 0.5258, "step": 753 }, { "epoch": 1.5584281282316441, "grad_norm": 0.3287646868242513, "learning_rate": 2.664877300613497e-05, "loss": 0.5388, "step": 754 }, { "epoch": 1.560496380558428, "grad_norm": 0.32632878129624476, "learning_rate": 2.6610429447852765e-05, "loss": 0.475, "step": 755 }, { "epoch": 1.5625646328852119, "grad_norm": 0.34552884523539884, "learning_rate": 2.6572085889570554e-05, "loss": 0.5684, "step": 756 }, { "epoch": 1.5646328852119957, "grad_norm": 0.3910491942294599, "learning_rate": 2.6533742331288346e-05, "loss": 0.5335, "step": 757 }, { "epoch": 1.5667011375387796, "grad_norm": 3.2137991938709334, "learning_rate": 2.6495398773006135e-05, "loss": 0.6906, "step": 758 }, { "epoch": 1.5687693898655635, "grad_norm": 0.3065501147948008, "learning_rate": 2.645705521472393e-05, "loss": 0.5903, "step": 759 }, { "epoch": 1.5708376421923473, "grad_norm": 0.3991250189613925, "learning_rate": 2.641871165644172e-05, "loss": 0.577, "step": 760 }, { "epoch": 1.5729058945191312, "grad_norm": 0.3608343280277744, "learning_rate": 2.638036809815951e-05, "loss": 0.5726, "step": 761 }, { "epoch": 1.574974146845915, "grad_norm": 0.3746679352375075, "learning_rate": 2.63420245398773e-05, "loss": 0.5146, "step": 762 }, { "epoch": 1.577042399172699, "grad_norm": 0.32423458230205443, "learning_rate": 2.630368098159509e-05, "loss": 0.4809, "step": 763 }, { "epoch": 1.5791106514994828, "grad_norm": 0.32577697857772453, "learning_rate": 2.6265337423312886e-05, "loss": 0.4989, "step": 764 }, { "epoch": 1.5811789038262667, "grad_norm": 0.33616383380962767, "learning_rate": 2.6226993865030675e-05, "loss": 0.5332, "step": 765 }, { "epoch": 1.5832471561530506, "grad_norm": 0.3935307328747496, "learning_rate": 2.6188650306748468e-05, "loss": 0.4986, "step": 766 }, { "epoch": 1.5853154084798344, "grad_norm": 0.27527420357552645, "learning_rate": 2.6150306748466257e-05, "loss": 0.4744, "step": 767 }, { "epoch": 1.5873836608066183, "grad_norm": 0.28024651744646817, "learning_rate": 2.6111963190184052e-05, "loss": 0.4883, "step": 768 }, { "epoch": 1.5894519131334022, "grad_norm": 0.34167666341785075, "learning_rate": 2.607361963190184e-05, "loss": 0.4932, "step": 769 }, { "epoch": 1.591520165460186, "grad_norm": 0.2880449639046094, "learning_rate": 2.6035276073619637e-05, "loss": 0.5131, "step": 770 }, { "epoch": 1.59358841778697, "grad_norm": 0.25111607313930817, "learning_rate": 2.5996932515337423e-05, "loss": 0.4787, "step": 771 }, { "epoch": 1.5956566701137538, "grad_norm": 0.27333372025257574, "learning_rate": 2.5958588957055212e-05, "loss": 0.5051, "step": 772 }, { "epoch": 1.5977249224405377, "grad_norm": 0.3316194326201587, "learning_rate": 2.5920245398773008e-05, "loss": 0.4587, "step": 773 }, { "epoch": 1.5997931747673215, "grad_norm": 0.28712758279930994, "learning_rate": 2.5881901840490797e-05, "loss": 0.4787, "step": 774 }, { "epoch": 1.6018614270941054, "grad_norm": 0.26096637755945534, "learning_rate": 2.5843558282208592e-05, "loss": 0.4752, "step": 775 }, { "epoch": 1.6039296794208893, "grad_norm": 0.30149105671219134, "learning_rate": 2.580521472392638e-05, "loss": 0.4978, "step": 776 }, { "epoch": 1.6059979317476731, "grad_norm": 0.2941684106557261, "learning_rate": 2.5766871165644174e-05, "loss": 0.5257, "step": 777 }, { "epoch": 1.608066184074457, "grad_norm": 0.3429750092604878, "learning_rate": 2.5728527607361963e-05, "loss": 0.6043, "step": 778 }, { "epoch": 1.6101344364012409, "grad_norm": 0.27362477811533986, "learning_rate": 2.569018404907976e-05, "loss": 0.4682, "step": 779 }, { "epoch": 1.6122026887280247, "grad_norm": 0.29262057500693583, "learning_rate": 2.5651840490797547e-05, "loss": 0.4742, "step": 780 }, { "epoch": 1.6142709410548086, "grad_norm": 0.30897815229555736, "learning_rate": 2.561349693251534e-05, "loss": 0.4444, "step": 781 }, { "epoch": 1.6163391933815925, "grad_norm": 0.2967617023451551, "learning_rate": 2.557515337423313e-05, "loss": 0.5439, "step": 782 }, { "epoch": 1.6184074457083764, "grad_norm": 0.2768956112280975, "learning_rate": 2.5536809815950925e-05, "loss": 0.5512, "step": 783 }, { "epoch": 1.6204756980351602, "grad_norm": 0.3762691643043268, "learning_rate": 2.5498466257668714e-05, "loss": 0.4993, "step": 784 }, { "epoch": 1.622543950361944, "grad_norm": 0.35555752599349183, "learning_rate": 2.5460122699386503e-05, "loss": 0.5265, "step": 785 }, { "epoch": 1.624612202688728, "grad_norm": 0.3102024412097539, "learning_rate": 2.5421779141104295e-05, "loss": 0.5503, "step": 786 }, { "epoch": 1.6266804550155118, "grad_norm": 3.227995400112414, "learning_rate": 2.5383435582822084e-05, "loss": 0.5251, "step": 787 }, { "epoch": 1.6287487073422957, "grad_norm": 0.4095303305259052, "learning_rate": 2.534509202453988e-05, "loss": 0.5639, "step": 788 }, { "epoch": 1.6308169596690796, "grad_norm": 0.36643495515920754, "learning_rate": 2.530674846625767e-05, "loss": 0.5093, "step": 789 }, { "epoch": 1.6328852119958635, "grad_norm": 0.398902214606772, "learning_rate": 2.526840490797546e-05, "loss": 0.518, "step": 790 }, { "epoch": 1.6349534643226473, "grad_norm": 0.2951439760342964, "learning_rate": 2.523006134969325e-05, "loss": 0.5622, "step": 791 }, { "epoch": 1.6370217166494312, "grad_norm": 0.5772060775722817, "learning_rate": 2.5191717791411046e-05, "loss": 0.4941, "step": 792 }, { "epoch": 1.639089968976215, "grad_norm": 0.3658267915409293, "learning_rate": 2.5153374233128835e-05, "loss": 0.4922, "step": 793 }, { "epoch": 1.641158221302999, "grad_norm": 0.3354500454768684, "learning_rate": 2.5115030674846627e-05, "loss": 0.456, "step": 794 }, { "epoch": 1.6432264736297828, "grad_norm": 0.7903759941262689, "learning_rate": 2.5076687116564416e-05, "loss": 0.4865, "step": 795 }, { "epoch": 1.6452947259565667, "grad_norm": 0.36357093415593456, "learning_rate": 2.5038343558282212e-05, "loss": 0.5499, "step": 796 }, { "epoch": 1.6473629782833505, "grad_norm": 0.3363024957315538, "learning_rate": 2.5e-05, "loss": 0.515, "step": 797 }, { "epoch": 1.6494312306101344, "grad_norm": 0.29483023586584134, "learning_rate": 2.4961656441717794e-05, "loss": 0.5779, "step": 798 }, { "epoch": 1.6514994829369183, "grad_norm": 0.36656748364572833, "learning_rate": 2.4923312883435586e-05, "loss": 0.5833, "step": 799 }, { "epoch": 1.6535677352637022, "grad_norm": 0.3375897932867393, "learning_rate": 2.4884969325153375e-05, "loss": 0.5552, "step": 800 }, { "epoch": 1.655635987590486, "grad_norm": 0.2820219588819253, "learning_rate": 2.4846625766871167e-05, "loss": 0.4981, "step": 801 }, { "epoch": 1.65770423991727, "grad_norm": 0.279831522791891, "learning_rate": 2.480828220858896e-05, "loss": 0.5532, "step": 802 }, { "epoch": 1.6597724922440538, "grad_norm": 0.31214674372509554, "learning_rate": 2.4769938650306752e-05, "loss": 0.5368, "step": 803 }, { "epoch": 1.6618407445708376, "grad_norm": 0.2671286270577531, "learning_rate": 2.473159509202454e-05, "loss": 0.5085, "step": 804 }, { "epoch": 1.6639089968976215, "grad_norm": 0.3307175065276503, "learning_rate": 2.469325153374233e-05, "loss": 0.5655, "step": 805 }, { "epoch": 1.6659772492244054, "grad_norm": 0.3227407364555052, "learning_rate": 2.4654907975460123e-05, "loss": 0.5015, "step": 806 }, { "epoch": 1.6680455015511892, "grad_norm": 0.4387925468743761, "learning_rate": 2.4616564417177915e-05, "loss": 0.5955, "step": 807 }, { "epoch": 1.6701137538779731, "grad_norm": 0.305517927233191, "learning_rate": 2.4578220858895707e-05, "loss": 0.4516, "step": 808 }, { "epoch": 1.672182006204757, "grad_norm": 0.3084330255540478, "learning_rate": 2.4539877300613496e-05, "loss": 0.5311, "step": 809 }, { "epoch": 1.6742502585315409, "grad_norm": 0.3584569620674896, "learning_rate": 2.450153374233129e-05, "loss": 0.5454, "step": 810 }, { "epoch": 1.6763185108583247, "grad_norm": 0.33657423180891055, "learning_rate": 2.446319018404908e-05, "loss": 0.5618, "step": 811 }, { "epoch": 1.6783867631851086, "grad_norm": 0.3021846975255988, "learning_rate": 2.4424846625766874e-05, "loss": 0.6155, "step": 812 }, { "epoch": 1.6804550155118925, "grad_norm": 0.3744940121790051, "learning_rate": 2.4386503067484666e-05, "loss": 0.5567, "step": 813 }, { "epoch": 1.6825232678386763, "grad_norm": 0.35445163602746116, "learning_rate": 2.4348159509202455e-05, "loss": 0.6291, "step": 814 }, { "epoch": 1.6845915201654602, "grad_norm": 3.1298463396870972, "learning_rate": 2.4309815950920247e-05, "loss": 0.5481, "step": 815 }, { "epoch": 1.686659772492244, "grad_norm": 0.3655593339504824, "learning_rate": 2.4271472392638036e-05, "loss": 0.5267, "step": 816 }, { "epoch": 1.688728024819028, "grad_norm": 0.37907110590056226, "learning_rate": 2.423312883435583e-05, "loss": 0.553, "step": 817 }, { "epoch": 1.6907962771458118, "grad_norm": 0.2598717392438581, "learning_rate": 2.419478527607362e-05, "loss": 0.6069, "step": 818 }, { "epoch": 1.6928645294725957, "grad_norm": 0.30269463889777143, "learning_rate": 2.415644171779141e-05, "loss": 0.4973, "step": 819 }, { "epoch": 1.6949327817993796, "grad_norm": 0.4101184452650099, "learning_rate": 2.4118098159509202e-05, "loss": 0.5112, "step": 820 }, { "epoch": 1.6970010341261634, "grad_norm": 0.2986758258298328, "learning_rate": 2.4079754601226995e-05, "loss": 0.5108, "step": 821 }, { "epoch": 1.6990692864529473, "grad_norm": 0.30253573445766896, "learning_rate": 2.4041411042944787e-05, "loss": 0.5154, "step": 822 }, { "epoch": 1.7011375387797312, "grad_norm": 0.29184628070901414, "learning_rate": 2.400306748466258e-05, "loss": 0.5672, "step": 823 }, { "epoch": 1.703205791106515, "grad_norm": 0.3249959095483481, "learning_rate": 2.396472392638037e-05, "loss": 0.545, "step": 824 }, { "epoch": 1.705274043433299, "grad_norm": 0.3248835723101898, "learning_rate": 2.392638036809816e-05, "loss": 0.4609, "step": 825 }, { "epoch": 1.7073422957600828, "grad_norm": 0.25551929172341026, "learning_rate": 2.3888036809815953e-05, "loss": 0.5108, "step": 826 }, { "epoch": 1.7094105480868667, "grad_norm": 0.33433079892760126, "learning_rate": 2.3849693251533746e-05, "loss": 0.5899, "step": 827 }, { "epoch": 1.7114788004136505, "grad_norm": 0.3137031529652779, "learning_rate": 2.3811349693251535e-05, "loss": 0.568, "step": 828 }, { "epoch": 1.7135470527404344, "grad_norm": 0.333426402289389, "learning_rate": 2.3773006134969324e-05, "loss": 0.5336, "step": 829 }, { "epoch": 1.7156153050672183, "grad_norm": 0.27218512078012475, "learning_rate": 2.3734662576687116e-05, "loss": 0.4896, "step": 830 }, { "epoch": 1.7176835573940021, "grad_norm": 0.3243540081648789, "learning_rate": 2.369631901840491e-05, "loss": 0.5644, "step": 831 }, { "epoch": 1.719751809720786, "grad_norm": 0.35443663557757493, "learning_rate": 2.36579754601227e-05, "loss": 0.4653, "step": 832 }, { "epoch": 1.7218200620475699, "grad_norm": 0.2765701746934585, "learning_rate": 2.361963190184049e-05, "loss": 0.538, "step": 833 }, { "epoch": 1.7238883143743537, "grad_norm": 0.2936651338196755, "learning_rate": 2.3581288343558282e-05, "loss": 0.6107, "step": 834 }, { "epoch": 1.7259565667011376, "grad_norm": 0.3029297266323485, "learning_rate": 2.3542944785276075e-05, "loss": 0.4472, "step": 835 }, { "epoch": 1.7280248190279215, "grad_norm": 0.29219277303804736, "learning_rate": 2.3504601226993867e-05, "loss": 0.5036, "step": 836 }, { "epoch": 1.7300930713547054, "grad_norm": 0.32497182077669756, "learning_rate": 2.346625766871166e-05, "loss": 0.5132, "step": 837 }, { "epoch": 1.7321613236814892, "grad_norm": 0.2791848667122403, "learning_rate": 2.342791411042945e-05, "loss": 0.556, "step": 838 }, { "epoch": 1.734229576008273, "grad_norm": 0.32218553484628537, "learning_rate": 2.338957055214724e-05, "loss": 0.5614, "step": 839 }, { "epoch": 1.736297828335057, "grad_norm": 0.296855017351657, "learning_rate": 2.3351226993865033e-05, "loss": 0.591, "step": 840 }, { "epoch": 1.7383660806618408, "grad_norm": 0.3048781624913469, "learning_rate": 2.3312883435582822e-05, "loss": 0.5354, "step": 841 }, { "epoch": 1.7404343329886247, "grad_norm": 0.35894054452897617, "learning_rate": 2.3274539877300615e-05, "loss": 0.5332, "step": 842 }, { "epoch": 1.7425025853154086, "grad_norm": 0.30511901480530335, "learning_rate": 2.3236196319018404e-05, "loss": 0.6042, "step": 843 }, { "epoch": 1.7445708376421925, "grad_norm": 0.2884176533593159, "learning_rate": 2.3197852760736196e-05, "loss": 0.5348, "step": 844 }, { "epoch": 1.7466390899689763, "grad_norm": 0.31040480711251733, "learning_rate": 2.315950920245399e-05, "loss": 0.4794, "step": 845 }, { "epoch": 1.7487073422957602, "grad_norm": 0.2778506797099404, "learning_rate": 2.312116564417178e-05, "loss": 0.6287, "step": 846 }, { "epoch": 1.750775594622544, "grad_norm": 3.6193673129819843, "learning_rate": 2.308282208588957e-05, "loss": 0.5413, "step": 847 }, { "epoch": 1.752843846949328, "grad_norm": 0.3950933978284187, "learning_rate": 2.3044478527607362e-05, "loss": 0.4356, "step": 848 }, { "epoch": 1.7549120992761118, "grad_norm": 0.315231312727176, "learning_rate": 2.3006134969325155e-05, "loss": 0.5162, "step": 849 }, { "epoch": 1.7569803516028957, "grad_norm": 0.28095945843075104, "learning_rate": 2.2967791411042947e-05, "loss": 0.6356, "step": 850 }, { "epoch": 1.7590486039296795, "grad_norm": 0.31896096282066044, "learning_rate": 2.292944785276074e-05, "loss": 0.5575, "step": 851 }, { "epoch": 1.7611168562564634, "grad_norm": 0.35984808313022293, "learning_rate": 2.289110429447853e-05, "loss": 0.4823, "step": 852 }, { "epoch": 1.7631851085832473, "grad_norm": 0.3071740404825863, "learning_rate": 2.285276073619632e-05, "loss": 0.5177, "step": 853 }, { "epoch": 1.7652533609100312, "grad_norm": 0.28197713008782416, "learning_rate": 2.281441717791411e-05, "loss": 0.5852, "step": 854 }, { "epoch": 1.767321613236815, "grad_norm": 0.33403774478404075, "learning_rate": 2.2776073619631902e-05, "loss": 0.5393, "step": 855 }, { "epoch": 1.769389865563599, "grad_norm": 0.2870899223588093, "learning_rate": 2.2737730061349695e-05, "loss": 0.5035, "step": 856 }, { "epoch": 1.7714581178903828, "grad_norm": 0.3335043678780113, "learning_rate": 2.2699386503067484e-05, "loss": 0.4811, "step": 857 }, { "epoch": 1.7735263702171666, "grad_norm": 0.29391106935735095, "learning_rate": 2.2661042944785276e-05, "loss": 0.5307, "step": 858 }, { "epoch": 1.7755946225439505, "grad_norm": 0.29840919835641294, "learning_rate": 2.262269938650307e-05, "loss": 0.5632, "step": 859 }, { "epoch": 1.7776628748707344, "grad_norm": 0.319832047916295, "learning_rate": 2.258435582822086e-05, "loss": 0.5335, "step": 860 }, { "epoch": 1.7797311271975182, "grad_norm": 0.31683212235003194, "learning_rate": 2.2546012269938653e-05, "loss": 0.5066, "step": 861 }, { "epoch": 1.7817993795243021, "grad_norm": 0.3205315175517229, "learning_rate": 2.2507668711656442e-05, "loss": 0.4838, "step": 862 }, { "epoch": 1.783867631851086, "grad_norm": 0.2829701601342136, "learning_rate": 2.2469325153374235e-05, "loss": 0.5054, "step": 863 }, { "epoch": 1.7859358841778699, "grad_norm": 0.2672152123473509, "learning_rate": 2.2430981595092027e-05, "loss": 0.5349, "step": 864 }, { "epoch": 1.7880041365046537, "grad_norm": 0.3080983816808202, "learning_rate": 2.239263803680982e-05, "loss": 0.5247, "step": 865 }, { "epoch": 1.7900723888314376, "grad_norm": 0.27222640277780347, "learning_rate": 2.235429447852761e-05, "loss": 0.506, "step": 866 }, { "epoch": 1.7921406411582212, "grad_norm": 0.2554457233967765, "learning_rate": 2.2315950920245397e-05, "loss": 0.5796, "step": 867 }, { "epoch": 1.7942088934850051, "grad_norm": 0.3292008707318216, "learning_rate": 2.227760736196319e-05, "loss": 0.5079, "step": 868 }, { "epoch": 1.796277145811789, "grad_norm": 0.2960287952774294, "learning_rate": 2.2239263803680982e-05, "loss": 0.5435, "step": 869 }, { "epoch": 1.7983453981385729, "grad_norm": 0.2626017596386058, "learning_rate": 2.2200920245398775e-05, "loss": 0.5814, "step": 870 }, { "epoch": 1.8004136504653567, "grad_norm": 0.2952851627155322, "learning_rate": 2.2162576687116564e-05, "loss": 0.5494, "step": 871 }, { "epoch": 1.8024819027921406, "grad_norm": 0.2827508494534035, "learning_rate": 2.2124233128834356e-05, "loss": 0.5271, "step": 872 }, { "epoch": 1.8045501551189245, "grad_norm": 0.2866825469998052, "learning_rate": 2.208588957055215e-05, "loss": 0.5977, "step": 873 }, { "epoch": 1.8066184074457083, "grad_norm": 2.1328066654431828, "learning_rate": 2.204754601226994e-05, "loss": 0.5569, "step": 874 }, { "epoch": 1.8086866597724922, "grad_norm": 0.29809482107956886, "learning_rate": 2.2009202453987733e-05, "loss": 0.6054, "step": 875 }, { "epoch": 1.810754912099276, "grad_norm": 0.27619220149837137, "learning_rate": 2.1970858895705522e-05, "loss": 1.2524, "step": 876 }, { "epoch": 1.81282316442606, "grad_norm": 7.105296818767888, "learning_rate": 2.1932515337423315e-05, "loss": 0.4529, "step": 877 }, { "epoch": 1.8148914167528438, "grad_norm": 0.3641562852636539, "learning_rate": 2.1894171779141107e-05, "loss": 0.5449, "step": 878 }, { "epoch": 1.8169596690796277, "grad_norm": 0.24926408367969538, "learning_rate": 2.1855828220858896e-05, "loss": 0.512, "step": 879 }, { "epoch": 1.8190279214064116, "grad_norm": 0.2774601038225622, "learning_rate": 2.181748466257669e-05, "loss": 0.4842, "step": 880 }, { "epoch": 1.8210961737331954, "grad_norm": 0.30751602647863835, "learning_rate": 2.1779141104294477e-05, "loss": 0.4863, "step": 881 }, { "epoch": 1.8231644260599793, "grad_norm": 0.2849329653773783, "learning_rate": 2.174079754601227e-05, "loss": 0.619, "step": 882 }, { "epoch": 1.8252326783867632, "grad_norm": 0.31774482586368075, "learning_rate": 2.1702453987730062e-05, "loss": 0.5619, "step": 883 }, { "epoch": 1.827300930713547, "grad_norm": 0.3205223371229409, "learning_rate": 2.1664110429447855e-05, "loss": 0.4931, "step": 884 }, { "epoch": 1.829369183040331, "grad_norm": 0.29769295767209436, "learning_rate": 2.1625766871165647e-05, "loss": 0.4779, "step": 885 }, { "epoch": 1.8314374353671148, "grad_norm": 0.30586597095881113, "learning_rate": 2.1587423312883436e-05, "loss": 0.5635, "step": 886 }, { "epoch": 1.8335056876938987, "grad_norm": 0.2802143796754693, "learning_rate": 2.154907975460123e-05, "loss": 0.5428, "step": 887 }, { "epoch": 1.8355739400206825, "grad_norm": 0.27035353450508615, "learning_rate": 2.151073619631902e-05, "loss": 0.5331, "step": 888 }, { "epoch": 1.8376421923474664, "grad_norm": 0.3443268687698467, "learning_rate": 2.1472392638036813e-05, "loss": 0.4888, "step": 889 }, { "epoch": 1.8397104446742503, "grad_norm": 0.3000281149365778, "learning_rate": 2.1434049079754602e-05, "loss": 0.44, "step": 890 }, { "epoch": 1.8417786970010341, "grad_norm": 0.29297427769292617, "learning_rate": 2.1395705521472395e-05, "loss": 0.5394, "step": 891 }, { "epoch": 1.843846949327818, "grad_norm": 0.3286074301782052, "learning_rate": 2.1357361963190184e-05, "loss": 0.4653, "step": 892 }, { "epoch": 1.8459152016546019, "grad_norm": 0.31186394877417695, "learning_rate": 2.1319018404907976e-05, "loss": 0.5881, "step": 893 }, { "epoch": 1.8479834539813857, "grad_norm": 0.2748964451261101, "learning_rate": 2.1280674846625768e-05, "loss": 0.5319, "step": 894 }, { "epoch": 1.8500517063081696, "grad_norm": 0.32198987099207094, "learning_rate": 2.1242331288343557e-05, "loss": 0.5741, "step": 895 }, { "epoch": 1.8521199586349535, "grad_norm": 0.2874315735364144, "learning_rate": 2.120398773006135e-05, "loss": 0.6203, "step": 896 }, { "epoch": 1.8541882109617374, "grad_norm": 0.28916247290544916, "learning_rate": 2.1165644171779142e-05, "loss": 0.5715, "step": 897 }, { "epoch": 1.8562564632885212, "grad_norm": 0.3104173768629215, "learning_rate": 2.1127300613496934e-05, "loss": 0.6434, "step": 898 }, { "epoch": 1.858324715615305, "grad_norm": 0.296081041019624, "learning_rate": 2.1088957055214727e-05, "loss": 0.5444, "step": 899 }, { "epoch": 1.860392967942089, "grad_norm": 0.3036235297781156, "learning_rate": 2.1050613496932516e-05, "loss": 0.5039, "step": 900 }, { "epoch": 1.8624612202688728, "grad_norm": 0.28337052702942295, "learning_rate": 2.1012269938650308e-05, "loss": 0.5586, "step": 901 }, { "epoch": 1.8645294725956567, "grad_norm": 0.2855455671410922, "learning_rate": 2.09739263803681e-05, "loss": 0.5609, "step": 902 }, { "epoch": 1.8665977249224406, "grad_norm": 0.29385374896004607, "learning_rate": 2.0935582822085893e-05, "loss": 0.544, "step": 903 }, { "epoch": 1.8686659772492245, "grad_norm": 0.2608377916404988, "learning_rate": 2.0897239263803682e-05, "loss": 0.4858, "step": 904 }, { "epoch": 1.8707342295760083, "grad_norm": 0.3099818978353369, "learning_rate": 2.085889570552147e-05, "loss": 0.6485, "step": 905 }, { "epoch": 1.8728024819027922, "grad_norm": 0.3064573272439758, "learning_rate": 2.0820552147239263e-05, "loss": 0.5133, "step": 906 }, { "epoch": 1.874870734229576, "grad_norm": 0.27328891827670243, "learning_rate": 2.0782208588957056e-05, "loss": 0.566, "step": 907 }, { "epoch": 1.8769389865563597, "grad_norm": 0.30813032718467304, "learning_rate": 2.0743865030674848e-05, "loss": 0.4552, "step": 908 }, { "epoch": 1.8790072388831436, "grad_norm": 0.2827525651658214, "learning_rate": 2.0705521472392637e-05, "loss": 0.616, "step": 909 }, { "epoch": 1.8810754912099275, "grad_norm": 0.24337211481445553, "learning_rate": 2.066717791411043e-05, "loss": 0.504, "step": 910 }, { "epoch": 1.8831437435367113, "grad_norm": 0.2637089170992205, "learning_rate": 2.0628834355828222e-05, "loss": 0.5008, "step": 911 }, { "epoch": 1.8852119958634952, "grad_norm": 0.28352903698851994, "learning_rate": 2.0590490797546014e-05, "loss": 0.4924, "step": 912 }, { "epoch": 1.887280248190279, "grad_norm": 0.25866456201310284, "learning_rate": 2.0552147239263807e-05, "loss": 0.5389, "step": 913 }, { "epoch": 1.889348500517063, "grad_norm": 0.30644529078283805, "learning_rate": 2.0513803680981596e-05, "loss": 0.4537, "step": 914 }, { "epoch": 1.8914167528438468, "grad_norm": 0.25639714083779186, "learning_rate": 2.0475460122699388e-05, "loss": 0.5252, "step": 915 }, { "epoch": 1.8934850051706307, "grad_norm": 0.27521360690031654, "learning_rate": 2.043711656441718e-05, "loss": 0.4921, "step": 916 }, { "epoch": 1.8955532574974145, "grad_norm": 0.33452792003252396, "learning_rate": 2.039877300613497e-05, "loss": 0.521, "step": 917 }, { "epoch": 1.8976215098241984, "grad_norm": 0.2731309188862945, "learning_rate": 2.0360429447852762e-05, "loss": 0.547, "step": 918 }, { "epoch": 1.8996897621509823, "grad_norm": 0.3136850008884297, "learning_rate": 2.032208588957055e-05, "loss": 0.4686, "step": 919 }, { "epoch": 1.9017580144777662, "grad_norm": 0.29186362668066246, "learning_rate": 2.0283742331288343e-05, "loss": 0.6024, "step": 920 }, { "epoch": 1.90382626680455, "grad_norm": 0.264049141431274, "learning_rate": 2.0245398773006136e-05, "loss": 0.4856, "step": 921 }, { "epoch": 1.905894519131334, "grad_norm": 0.2692732394294709, "learning_rate": 2.0207055214723928e-05, "loss": 0.5845, "step": 922 }, { "epoch": 1.9079627714581178, "grad_norm": 0.32808568835716406, "learning_rate": 2.016871165644172e-05, "loss": 0.5555, "step": 923 }, { "epoch": 1.9100310237849016, "grad_norm": 0.26845936120516317, "learning_rate": 2.013036809815951e-05, "loss": 0.5107, "step": 924 }, { "epoch": 1.9120992761116855, "grad_norm": 0.28235623088967376, "learning_rate": 2.0092024539877302e-05, "loss": 0.5597, "step": 925 }, { "epoch": 1.9141675284384694, "grad_norm": 0.3049166587139517, "learning_rate": 2.0053680981595094e-05, "loss": 0.5924, "step": 926 }, { "epoch": 1.9162357807652532, "grad_norm": 0.3104739337546623, "learning_rate": 2.0015337423312887e-05, "loss": 0.5246, "step": 927 }, { "epoch": 1.9183040330920371, "grad_norm": 0.2625423178953885, "learning_rate": 1.9976993865030676e-05, "loss": 0.4615, "step": 928 }, { "epoch": 1.920372285418821, "grad_norm": 0.2980320690752966, "learning_rate": 1.9938650306748465e-05, "loss": 0.5011, "step": 929 }, { "epoch": 1.9224405377456049, "grad_norm": 0.3542874418397254, "learning_rate": 1.9900306748466257e-05, "loss": 0.5925, "step": 930 }, { "epoch": 1.9245087900723887, "grad_norm": 0.2927615626946534, "learning_rate": 1.986196319018405e-05, "loss": 0.5594, "step": 931 }, { "epoch": 1.9265770423991726, "grad_norm": 0.34337400574719745, "learning_rate": 1.9823619631901842e-05, "loss": 0.5289, "step": 932 }, { "epoch": 1.9286452947259565, "grad_norm": 0.2674178863338546, "learning_rate": 1.978527607361963e-05, "loss": 0.4651, "step": 933 }, { "epoch": 1.9307135470527403, "grad_norm": 0.2558761079115472, "learning_rate": 1.9746932515337423e-05, "loss": 0.5493, "step": 934 }, { "epoch": 1.9327817993795242, "grad_norm": 0.2610956484472218, "learning_rate": 1.9708588957055216e-05, "loss": 0.4749, "step": 935 }, { "epoch": 1.934850051706308, "grad_norm": 0.2997336747129066, "learning_rate": 1.9670245398773008e-05, "loss": 0.5452, "step": 936 }, { "epoch": 1.936918304033092, "grad_norm": 0.2694297478903577, "learning_rate": 1.96319018404908e-05, "loss": 0.7465, "step": 937 }, { "epoch": 1.9389865563598758, "grad_norm": 6.089618606509764, "learning_rate": 1.959355828220859e-05, "loss": 0.6245, "step": 938 }, { "epoch": 1.9410548086866597, "grad_norm": 0.3476891910159929, "learning_rate": 1.9555214723926382e-05, "loss": 0.5194, "step": 939 }, { "epoch": 1.9431230610134436, "grad_norm": 0.35677077768792215, "learning_rate": 1.9516871165644174e-05, "loss": 0.5234, "step": 940 }, { "epoch": 1.9451913133402274, "grad_norm": 0.34708288846176266, "learning_rate": 1.9478527607361967e-05, "loss": 0.5647, "step": 941 }, { "epoch": 1.9472595656670113, "grad_norm": 0.9600719871698241, "learning_rate": 1.9440184049079756e-05, "loss": 0.432, "step": 942 }, { "epoch": 1.9493278179937952, "grad_norm": 0.30975372172676197, "learning_rate": 1.9401840490797545e-05, "loss": 0.5276, "step": 943 }, { "epoch": 1.951396070320579, "grad_norm": 0.3245352559958314, "learning_rate": 1.9363496932515337e-05, "loss": 0.4956, "step": 944 }, { "epoch": 1.953464322647363, "grad_norm": 0.2951893675374994, "learning_rate": 1.932515337423313e-05, "loss": 0.5368, "step": 945 }, { "epoch": 1.9555325749741468, "grad_norm": 0.34735922799169755, "learning_rate": 1.9286809815950922e-05, "loss": 0.5577, "step": 946 }, { "epoch": 1.9576008273009307, "grad_norm": 0.3183919034233625, "learning_rate": 1.924846625766871e-05, "loss": 0.5231, "step": 947 }, { "epoch": 1.9596690796277145, "grad_norm": 0.2869924923611133, "learning_rate": 1.9210122699386503e-05, "loss": 0.527, "step": 948 }, { "epoch": 1.9617373319544984, "grad_norm": 0.31374693438794854, "learning_rate": 1.9171779141104296e-05, "loss": 0.533, "step": 949 }, { "epoch": 1.9638055842812823, "grad_norm": 0.31844571617823136, "learning_rate": 1.9133435582822088e-05, "loss": 0.5735, "step": 950 }, { "epoch": 1.9658738366080661, "grad_norm": 0.32301158083849846, "learning_rate": 1.909509202453988e-05, "loss": 0.4999, "step": 951 }, { "epoch": 1.96794208893485, "grad_norm": 0.3000600785132947, "learning_rate": 1.905674846625767e-05, "loss": 0.5349, "step": 952 }, { "epoch": 1.9700103412616339, "grad_norm": 0.29402920758425233, "learning_rate": 1.9018404907975462e-05, "loss": 0.5021, "step": 953 }, { "epoch": 1.9720785935884177, "grad_norm": 0.3119011613186792, "learning_rate": 1.898006134969325e-05, "loss": 0.6134, "step": 954 }, { "epoch": 1.9741468459152016, "grad_norm": 0.2792466398520923, "learning_rate": 1.8941717791411043e-05, "loss": 0.5725, "step": 955 }, { "epoch": 1.9762150982419855, "grad_norm": 0.387891681792125, "learning_rate": 1.8903374233128836e-05, "loss": 0.5291, "step": 956 }, { "epoch": 1.9782833505687694, "grad_norm": 0.2850966893772775, "learning_rate": 1.8865030674846625e-05, "loss": 0.5631, "step": 957 }, { "epoch": 1.9803516028955532, "grad_norm": 0.3237650740346515, "learning_rate": 1.8826687116564417e-05, "loss": 0.5547, "step": 958 }, { "epoch": 1.982419855222337, "grad_norm": 0.37605029877968865, "learning_rate": 1.878834355828221e-05, "loss": 0.516, "step": 959 }, { "epoch": 1.984488107549121, "grad_norm": 0.3124448693485492, "learning_rate": 1.8750000000000002e-05, "loss": 0.5581, "step": 960 }, { "epoch": 1.9865563598759048, "grad_norm": 0.25702742084013197, "learning_rate": 1.8711656441717794e-05, "loss": 0.5008, "step": 961 }, { "epoch": 1.9886246122026887, "grad_norm": 0.25381435481389747, "learning_rate": 1.8673312883435583e-05, "loss": 0.5037, "step": 962 }, { "epoch": 1.9906928645294726, "grad_norm": 0.2708622495250703, "learning_rate": 1.8634969325153376e-05, "loss": 0.4507, "step": 963 }, { "epoch": 1.9927611168562565, "grad_norm": 0.2570302015166466, "learning_rate": 1.8596625766871168e-05, "loss": 0.4882, "step": 964 }, { "epoch": 1.9948293691830403, "grad_norm": 0.26659173404555514, "learning_rate": 1.855828220858896e-05, "loss": 0.5718, "step": 965 }, { "epoch": 1.9968976215098242, "grad_norm": 0.32655970108287535, "learning_rate": 1.851993865030675e-05, "loss": 0.55, "step": 966 }, { "epoch": 1.998965873836608, "grad_norm": 0.2720952370098889, "learning_rate": 1.848159509202454e-05, "loss": 0.477, "step": 967 }, { "epoch": 2.0, "grad_norm": 0.431000448202053, "learning_rate": 1.844325153374233e-05, "loss": 0.4745, "step": 968 }, { "epoch": 2.002068252326784, "grad_norm": 0.47629179305706953, "learning_rate": 1.8404907975460123e-05, "loss": 0.4549, "step": 969 }, { "epoch": 2.0041365046535677, "grad_norm": 0.30831367524548703, "learning_rate": 1.8366564417177915e-05, "loss": 0.3918, "step": 970 }, { "epoch": 2.0062047569803516, "grad_norm": 0.3134878784232232, "learning_rate": 1.8328220858895704e-05, "loss": 0.3827, "step": 971 }, { "epoch": 2.0082730093071355, "grad_norm": 0.4570845445008937, "learning_rate": 1.8289877300613497e-05, "loss": 0.4497, "step": 972 }, { "epoch": 2.0103412616339194, "grad_norm": 0.38734953893456137, "learning_rate": 1.825153374233129e-05, "loss": 0.4716, "step": 973 }, { "epoch": 2.0124095139607032, "grad_norm": 0.29908493348758625, "learning_rate": 1.821319018404908e-05, "loss": 0.4112, "step": 974 }, { "epoch": 2.014477766287487, "grad_norm": 0.3877313072445116, "learning_rate": 1.8174846625766874e-05, "loss": 0.4246, "step": 975 }, { "epoch": 2.016546018614271, "grad_norm": 0.3963242698981563, "learning_rate": 1.8136503067484663e-05, "loss": 0.3904, "step": 976 }, { "epoch": 2.018614270941055, "grad_norm": 0.36190254729542776, "learning_rate": 1.8098159509202455e-05, "loss": 0.45, "step": 977 }, { "epoch": 2.0206825232678387, "grad_norm": 0.33706832326394603, "learning_rate": 1.8059815950920248e-05, "loss": 0.4445, "step": 978 }, { "epoch": 2.0227507755946226, "grad_norm": 0.31893681865351103, "learning_rate": 1.8021472392638037e-05, "loss": 0.3881, "step": 979 }, { "epoch": 2.0248190279214064, "grad_norm": 0.3529552134603544, "learning_rate": 1.798312883435583e-05, "loss": 0.4693, "step": 980 }, { "epoch": 2.0268872802481903, "grad_norm": 0.3057424206081247, "learning_rate": 1.7944785276073618e-05, "loss": 0.4036, "step": 981 }, { "epoch": 2.028955532574974, "grad_norm": 0.2775624114732881, "learning_rate": 1.790644171779141e-05, "loss": 0.389, "step": 982 }, { "epoch": 2.031023784901758, "grad_norm": 0.31466325044399024, "learning_rate": 1.7868098159509203e-05, "loss": 0.4377, "step": 983 }, { "epoch": 2.033092037228542, "grad_norm": 0.2842866785792338, "learning_rate": 1.7829754601226995e-05, "loss": 0.4431, "step": 984 }, { "epoch": 2.035160289555326, "grad_norm": 0.29871902143033124, "learning_rate": 1.7791411042944784e-05, "loss": 0.3882, "step": 985 }, { "epoch": 2.0372285418821097, "grad_norm": 0.2610852458085795, "learning_rate": 1.7753067484662577e-05, "loss": 0.387, "step": 986 }, { "epoch": 2.0392967942088935, "grad_norm": 0.27759849679362564, "learning_rate": 1.771472392638037e-05, "loss": 0.4127, "step": 987 }, { "epoch": 2.0413650465356774, "grad_norm": 0.2771796553099112, "learning_rate": 1.767638036809816e-05, "loss": 0.422, "step": 988 }, { "epoch": 2.0434332988624613, "grad_norm": 0.2667538898673085, "learning_rate": 1.7638036809815954e-05, "loss": 0.3931, "step": 989 }, { "epoch": 2.045501551189245, "grad_norm": 0.24249271732860572, "learning_rate": 1.7599693251533743e-05, "loss": 0.4172, "step": 990 }, { "epoch": 2.047569803516029, "grad_norm": 0.22777740528682705, "learning_rate": 1.7561349693251535e-05, "loss": 0.3807, "step": 991 }, { "epoch": 2.049638055842813, "grad_norm": 0.2719652495192794, "learning_rate": 1.7523006134969324e-05, "loss": 0.4093, "step": 992 }, { "epoch": 2.0517063081695968, "grad_norm": 0.2608694845506966, "learning_rate": 1.7484662576687117e-05, "loss": 0.4059, "step": 993 }, { "epoch": 2.0537745604963806, "grad_norm": 0.23137640743458612, "learning_rate": 1.744631901840491e-05, "loss": 0.3833, "step": 994 }, { "epoch": 2.0558428128231645, "grad_norm": 14.050920730376216, "learning_rate": 1.7407975460122698e-05, "loss": 0.8095, "step": 995 }, { "epoch": 2.0579110651499484, "grad_norm": 0.26724420870203536, "learning_rate": 1.736963190184049e-05, "loss": 0.393, "step": 996 }, { "epoch": 2.0599793174767322, "grad_norm": 0.2854665573405461, "learning_rate": 1.7331288343558283e-05, "loss": 0.3961, "step": 997 }, { "epoch": 2.062047569803516, "grad_norm": 0.25537799903795294, "learning_rate": 1.7292944785276075e-05, "loss": 0.3737, "step": 998 }, { "epoch": 2.0641158221303, "grad_norm": 0.25893744311445527, "learning_rate": 1.7254601226993868e-05, "loss": 0.4021, "step": 999 }, { "epoch": 2.066184074457084, "grad_norm": 0.30613106140881974, "learning_rate": 1.7216257668711657e-05, "loss": 0.4729, "step": 1000 }, { "epoch": 2.0682523267838677, "grad_norm": 0.2526776894052428, "learning_rate": 1.717791411042945e-05, "loss": 0.4359, "step": 1001 }, { "epoch": 2.0703205791106516, "grad_norm": 0.2773158972841604, "learning_rate": 1.713957055214724e-05, "loss": 0.4096, "step": 1002 }, { "epoch": 2.0723888314374355, "grad_norm": 0.25429904994920627, "learning_rate": 1.7101226993865034e-05, "loss": 0.3858, "step": 1003 }, { "epoch": 2.0744570837642193, "grad_norm": 0.24311119766928332, "learning_rate": 1.7062883435582823e-05, "loss": 0.421, "step": 1004 }, { "epoch": 2.076525336091003, "grad_norm": 0.24428540937736087, "learning_rate": 1.7024539877300612e-05, "loss": 0.418, "step": 1005 }, { "epoch": 2.078593588417787, "grad_norm": 0.2571040576680998, "learning_rate": 1.6986196319018404e-05, "loss": 0.402, "step": 1006 }, { "epoch": 2.080661840744571, "grad_norm": 0.7743384384320491, "learning_rate": 1.6947852760736197e-05, "loss": 0.4547, "step": 1007 }, { "epoch": 2.082730093071355, "grad_norm": 0.2733830677462806, "learning_rate": 1.690950920245399e-05, "loss": 0.3852, "step": 1008 }, { "epoch": 2.0847983453981387, "grad_norm": 0.2522290375402074, "learning_rate": 1.6871165644171778e-05, "loss": 0.4223, "step": 1009 }, { "epoch": 2.0868665977249226, "grad_norm": 0.2732755384487161, "learning_rate": 1.683282208588957e-05, "loss": 0.4004, "step": 1010 }, { "epoch": 2.0889348500517064, "grad_norm": 0.24307175445063378, "learning_rate": 1.6794478527607363e-05, "loss": 0.3852, "step": 1011 }, { "epoch": 2.0910031023784903, "grad_norm": 0.2747076365181285, "learning_rate": 1.6756134969325155e-05, "loss": 0.4258, "step": 1012 }, { "epoch": 2.093071354705274, "grad_norm": 0.2369452823161007, "learning_rate": 1.6717791411042948e-05, "loss": 0.4092, "step": 1013 }, { "epoch": 2.095139607032058, "grad_norm": 0.29027754593554417, "learning_rate": 1.6679447852760737e-05, "loss": 0.457, "step": 1014 }, { "epoch": 2.097207859358842, "grad_norm": 0.28362060317359533, "learning_rate": 1.664110429447853e-05, "loss": 0.4404, "step": 1015 }, { "epoch": 2.099276111685626, "grad_norm": 0.2740917945440104, "learning_rate": 1.660276073619632e-05, "loss": 0.3775, "step": 1016 }, { "epoch": 2.1013443640124096, "grad_norm": 0.26412036666066246, "learning_rate": 1.656441717791411e-05, "loss": 0.4129, "step": 1017 }, { "epoch": 2.1034126163391935, "grad_norm": 0.24231873395899542, "learning_rate": 1.6526073619631903e-05, "loss": 0.4281, "step": 1018 }, { "epoch": 2.1054808686659774, "grad_norm": 0.24190812048804688, "learning_rate": 1.6487730061349692e-05, "loss": 0.4087, "step": 1019 }, { "epoch": 2.1075491209927613, "grad_norm": 0.25258778751058353, "learning_rate": 1.6449386503067484e-05, "loss": 0.3999, "step": 1020 }, { "epoch": 2.109617373319545, "grad_norm": 0.234590701631443, "learning_rate": 1.6411042944785277e-05, "loss": 0.3583, "step": 1021 }, { "epoch": 2.111685625646329, "grad_norm": 0.241857249535167, "learning_rate": 1.637269938650307e-05, "loss": 0.4022, "step": 1022 }, { "epoch": 2.113753877973113, "grad_norm": 0.2516807687526618, "learning_rate": 1.633435582822086e-05, "loss": 0.4153, "step": 1023 }, { "epoch": 2.1158221302998967, "grad_norm": 0.23899990224421983, "learning_rate": 1.629601226993865e-05, "loss": 0.4054, "step": 1024 }, { "epoch": 2.1178903826266806, "grad_norm": 0.2407519557177942, "learning_rate": 1.6257668711656443e-05, "loss": 0.3995, "step": 1025 }, { "epoch": 2.1199586349534645, "grad_norm": 0.28546564206209335, "learning_rate": 1.6219325153374235e-05, "loss": 0.3912, "step": 1026 }, { "epoch": 2.1220268872802484, "grad_norm": 0.2395485444160295, "learning_rate": 1.6180981595092028e-05, "loss": 0.4073, "step": 1027 }, { "epoch": 2.1240951396070322, "grad_norm": 0.2669572664349683, "learning_rate": 1.6142638036809817e-05, "loss": 0.3626, "step": 1028 }, { "epoch": 2.126163391933816, "grad_norm": 0.284192743010817, "learning_rate": 1.6104294478527606e-05, "loss": 0.421, "step": 1029 }, { "epoch": 2.1282316442606, "grad_norm": 0.2388171166439212, "learning_rate": 1.6065950920245398e-05, "loss": 0.371, "step": 1030 }, { "epoch": 2.130299896587384, "grad_norm": 0.24506918941644346, "learning_rate": 1.602760736196319e-05, "loss": 0.3728, "step": 1031 }, { "epoch": 2.1323681489141677, "grad_norm": 0.28277520264695427, "learning_rate": 1.5989263803680983e-05, "loss": 0.3959, "step": 1032 }, { "epoch": 2.1344364012409516, "grad_norm": 0.26622970037349175, "learning_rate": 1.5950920245398772e-05, "loss": 0.3918, "step": 1033 }, { "epoch": 2.1365046535677354, "grad_norm": 0.24068061155269735, "learning_rate": 1.5912576687116564e-05, "loss": 0.4067, "step": 1034 }, { "epoch": 2.1385729058945193, "grad_norm": 0.24710983919176413, "learning_rate": 1.5874233128834357e-05, "loss": 0.3955, "step": 1035 }, { "epoch": 2.140641158221303, "grad_norm": 0.2540507658610337, "learning_rate": 1.583588957055215e-05, "loss": 0.402, "step": 1036 }, { "epoch": 2.142709410548087, "grad_norm": 0.2657365718518537, "learning_rate": 1.579754601226994e-05, "loss": 0.4213, "step": 1037 }, { "epoch": 2.144777662874871, "grad_norm": 0.25509697886808813, "learning_rate": 1.575920245398773e-05, "loss": 0.3511, "step": 1038 }, { "epoch": 2.146845915201655, "grad_norm": 0.28939373323955836, "learning_rate": 1.5720858895705523e-05, "loss": 0.4441, "step": 1039 }, { "epoch": 2.1489141675284387, "grad_norm": 0.27363573125584284, "learning_rate": 1.5682515337423315e-05, "loss": 0.4283, "step": 1040 }, { "epoch": 2.1509824198552225, "grad_norm": 0.23999451024649532, "learning_rate": 1.5644171779141108e-05, "loss": 0.3555, "step": 1041 }, { "epoch": 2.1530506721820064, "grad_norm": 0.2734196176711865, "learning_rate": 1.5605828220858897e-05, "loss": 0.4144, "step": 1042 }, { "epoch": 2.1551189245087903, "grad_norm": 0.25918307646551864, "learning_rate": 1.5567484662576686e-05, "loss": 0.4425, "step": 1043 }, { "epoch": 2.157187176835574, "grad_norm": 0.2476889038604224, "learning_rate": 1.5529141104294478e-05, "loss": 0.4291, "step": 1044 }, { "epoch": 2.159255429162358, "grad_norm": 0.2429993467831973, "learning_rate": 1.549079754601227e-05, "loss": 0.3877, "step": 1045 }, { "epoch": 2.161323681489142, "grad_norm": 0.23782378268394525, "learning_rate": 1.5452453987730063e-05, "loss": 0.3803, "step": 1046 }, { "epoch": 2.1633919338159258, "grad_norm": 0.2627938163442745, "learning_rate": 1.5414110429447852e-05, "loss": 0.4261, "step": 1047 }, { "epoch": 2.1654601861427096, "grad_norm": 0.2578671943879321, "learning_rate": 1.5375766871165644e-05, "loss": 0.3883, "step": 1048 }, { "epoch": 2.1675284384694935, "grad_norm": 0.24343362300631965, "learning_rate": 1.5337423312883436e-05, "loss": 0.3983, "step": 1049 }, { "epoch": 2.1695966907962774, "grad_norm": 0.2614318151219951, "learning_rate": 1.529907975460123e-05, "loss": 0.4365, "step": 1050 }, { "epoch": 2.1716649431230612, "grad_norm": 0.2764018748170909, "learning_rate": 1.526073619631902e-05, "loss": 0.3972, "step": 1051 }, { "epoch": 2.173733195449845, "grad_norm": 0.2856884574334619, "learning_rate": 1.5222392638036812e-05, "loss": 0.4176, "step": 1052 }, { "epoch": 2.175801447776629, "grad_norm": 0.28485346746106693, "learning_rate": 1.5184049079754603e-05, "loss": 0.4673, "step": 1053 }, { "epoch": 2.1778697001034124, "grad_norm": 0.3169563738520476, "learning_rate": 1.5145705521472395e-05, "loss": 0.4829, "step": 1054 }, { "epoch": 2.1799379524301963, "grad_norm": 0.27265565953679427, "learning_rate": 1.5107361963190184e-05, "loss": 0.3871, "step": 1055 }, { "epoch": 2.18200620475698, "grad_norm": 0.30637400192607606, "learning_rate": 1.5069018404907975e-05, "loss": 0.4268, "step": 1056 }, { "epoch": 2.184074457083764, "grad_norm": 0.2649188285628854, "learning_rate": 1.5030674846625767e-05, "loss": 0.4179, "step": 1057 }, { "epoch": 2.186142709410548, "grad_norm": 0.2951384210241639, "learning_rate": 1.4992331288343558e-05, "loss": 0.3933, "step": 1058 }, { "epoch": 2.1882109617373318, "grad_norm": 0.24603556197291945, "learning_rate": 1.495398773006135e-05, "loss": 0.3947, "step": 1059 }, { "epoch": 2.1902792140641156, "grad_norm": 0.2560550608209738, "learning_rate": 1.4915644171779141e-05, "loss": 0.4138, "step": 1060 }, { "epoch": 2.1923474663908995, "grad_norm": 0.24797184523320917, "learning_rate": 1.4877300613496933e-05, "loss": 0.4095, "step": 1061 }, { "epoch": 2.1944157187176834, "grad_norm": 0.27417586377492764, "learning_rate": 1.4838957055214726e-05, "loss": 0.4694, "step": 1062 }, { "epoch": 2.1964839710444672, "grad_norm": 0.23440770829688165, "learning_rate": 1.4800613496932516e-05, "loss": 0.4194, "step": 1063 }, { "epoch": 2.198552223371251, "grad_norm": 0.2921326795333639, "learning_rate": 1.4762269938650309e-05, "loss": 0.4871, "step": 1064 }, { "epoch": 2.200620475698035, "grad_norm": 0.2704790139020965, "learning_rate": 1.47239263803681e-05, "loss": 0.4206, "step": 1065 }, { "epoch": 2.202688728024819, "grad_norm": 0.24411520226428363, "learning_rate": 1.4685582822085892e-05, "loss": 0.4318, "step": 1066 }, { "epoch": 2.2047569803516027, "grad_norm": 0.25437560008138843, "learning_rate": 1.4647239263803681e-05, "loss": 0.4361, "step": 1067 }, { "epoch": 2.2068252326783866, "grad_norm": 0.2415089561497329, "learning_rate": 1.4608895705521472e-05, "loss": 0.41, "step": 1068 }, { "epoch": 2.2088934850051705, "grad_norm": 0.24286589136351697, "learning_rate": 1.4570552147239264e-05, "loss": 0.4152, "step": 1069 }, { "epoch": 2.2109617373319543, "grad_norm": 0.26849653437103355, "learning_rate": 1.4532208588957055e-05, "loss": 0.4158, "step": 1070 }, { "epoch": 2.213029989658738, "grad_norm": 0.24400501570415445, "learning_rate": 1.4493865030674847e-05, "loss": 0.3954, "step": 1071 }, { "epoch": 2.215098241985522, "grad_norm": 0.25921102334772883, "learning_rate": 1.4455521472392638e-05, "loss": 0.4228, "step": 1072 }, { "epoch": 2.217166494312306, "grad_norm": 0.2462793840518496, "learning_rate": 1.441717791411043e-05, "loss": 0.4141, "step": 1073 }, { "epoch": 2.21923474663909, "grad_norm": 0.23706271372877769, "learning_rate": 1.4378834355828223e-05, "loss": 0.377, "step": 1074 }, { "epoch": 2.2213029989658737, "grad_norm": 0.2751707936806633, "learning_rate": 1.4340490797546013e-05, "loss": 0.4398, "step": 1075 }, { "epoch": 2.2233712512926576, "grad_norm": 0.25578795622693073, "learning_rate": 1.4302147239263806e-05, "loss": 0.4198, "step": 1076 }, { "epoch": 2.2254395036194414, "grad_norm": 0.25262590650341066, "learning_rate": 1.4263803680981596e-05, "loss": 0.3706, "step": 1077 }, { "epoch": 2.2275077559462253, "grad_norm": 0.2368339750145138, "learning_rate": 1.4225460122699389e-05, "loss": 0.3814, "step": 1078 }, { "epoch": 2.229576008273009, "grad_norm": 0.29426199780868423, "learning_rate": 1.418711656441718e-05, "loss": 0.4748, "step": 1079 }, { "epoch": 2.231644260599793, "grad_norm": 0.25178275554378277, "learning_rate": 1.4148773006134968e-05, "loss": 0.4503, "step": 1080 }, { "epoch": 2.233712512926577, "grad_norm": 0.2410902541145751, "learning_rate": 1.411042944785276e-05, "loss": 0.4214, "step": 1081 }, { "epoch": 2.235780765253361, "grad_norm": 0.24477364090528905, "learning_rate": 1.4072085889570552e-05, "loss": 0.4452, "step": 1082 }, { "epoch": 2.2378490175801447, "grad_norm": 0.23843938694651437, "learning_rate": 1.4033742331288344e-05, "loss": 0.3773, "step": 1083 }, { "epoch": 2.2399172699069285, "grad_norm": 0.23517105580228817, "learning_rate": 1.3995398773006135e-05, "loss": 0.3687, "step": 1084 }, { "epoch": 2.2419855222337124, "grad_norm": 0.22791364919064683, "learning_rate": 1.3957055214723927e-05, "loss": 0.3876, "step": 1085 }, { "epoch": 2.2440537745604963, "grad_norm": 0.25732529821418765, "learning_rate": 1.391871165644172e-05, "loss": 0.3828, "step": 1086 }, { "epoch": 2.24612202688728, "grad_norm": 0.29356679938106167, "learning_rate": 1.388036809815951e-05, "loss": 0.47, "step": 1087 }, { "epoch": 2.248190279214064, "grad_norm": 0.24905399961506394, "learning_rate": 1.3842024539877302e-05, "loss": 0.4123, "step": 1088 }, { "epoch": 2.250258531540848, "grad_norm": 0.2565007215271316, "learning_rate": 1.3803680981595093e-05, "loss": 0.4083, "step": 1089 }, { "epoch": 2.2523267838676317, "grad_norm": 2.325722490379455, "learning_rate": 1.3765337423312886e-05, "loss": 0.4001, "step": 1090 }, { "epoch": 2.2543950361944156, "grad_norm": 0.23381977734583748, "learning_rate": 1.3726993865030676e-05, "loss": 0.4013, "step": 1091 }, { "epoch": 2.2564632885211995, "grad_norm": 0.2559163736755959, "learning_rate": 1.3688650306748465e-05, "loss": 0.3948, "step": 1092 }, { "epoch": 2.2585315408479834, "grad_norm": 0.3390442557179547, "learning_rate": 1.3650306748466258e-05, "loss": 0.4318, "step": 1093 }, { "epoch": 2.2605997931747672, "grad_norm": 0.2504607904708446, "learning_rate": 1.3611963190184048e-05, "loss": 0.4042, "step": 1094 }, { "epoch": 2.262668045501551, "grad_norm": 0.24973227255016298, "learning_rate": 1.357361963190184e-05, "loss": 0.351, "step": 1095 }, { "epoch": 2.264736297828335, "grad_norm": 0.2570760426563545, "learning_rate": 1.3535276073619631e-05, "loss": 0.3868, "step": 1096 }, { "epoch": 2.266804550155119, "grad_norm": 0.24055479598084764, "learning_rate": 1.3496932515337424e-05, "loss": 0.4, "step": 1097 }, { "epoch": 2.2688728024819027, "grad_norm": 0.25921211827722074, "learning_rate": 1.3458588957055216e-05, "loss": 0.3863, "step": 1098 }, { "epoch": 2.2709410548086866, "grad_norm": 0.267049844387904, "learning_rate": 1.3420245398773007e-05, "loss": 0.4442, "step": 1099 }, { "epoch": 2.2730093071354704, "grad_norm": 0.2479000820065534, "learning_rate": 1.33819018404908e-05, "loss": 0.3843, "step": 1100 }, { "epoch": 2.2750775594622543, "grad_norm": 0.2822817840774448, "learning_rate": 1.334355828220859e-05, "loss": 0.4318, "step": 1101 }, { "epoch": 2.277145811789038, "grad_norm": 0.2525805956236751, "learning_rate": 1.3305214723926382e-05, "loss": 0.4406, "step": 1102 }, { "epoch": 2.279214064115822, "grad_norm": 0.26202277557173825, "learning_rate": 1.3266871165644173e-05, "loss": 0.4248, "step": 1103 }, { "epoch": 2.281282316442606, "grad_norm": 0.29017392021452904, "learning_rate": 1.3228527607361965e-05, "loss": 0.4256, "step": 1104 }, { "epoch": 2.28335056876939, "grad_norm": 0.2507987934146511, "learning_rate": 1.3190184049079754e-05, "loss": 0.4186, "step": 1105 }, { "epoch": 2.2854188210961737, "grad_norm": 1.6555656723053405, "learning_rate": 1.3151840490797545e-05, "loss": 0.452, "step": 1106 }, { "epoch": 2.2874870734229575, "grad_norm": 1.8304851695428117, "learning_rate": 1.3113496932515338e-05, "loss": 0.4277, "step": 1107 }, { "epoch": 2.2895553257497414, "grad_norm": 0.35022581752974485, "learning_rate": 1.3075153374233128e-05, "loss": 0.4714, "step": 1108 }, { "epoch": 2.2916235780765253, "grad_norm": 0.29341426330954923, "learning_rate": 1.303680981595092e-05, "loss": 0.4151, "step": 1109 }, { "epoch": 2.293691830403309, "grad_norm": 0.24276611767954465, "learning_rate": 1.2998466257668711e-05, "loss": 0.3687, "step": 1110 }, { "epoch": 2.295760082730093, "grad_norm": 0.2535380533978877, "learning_rate": 1.2960122699386504e-05, "loss": 0.3696, "step": 1111 }, { "epoch": 2.297828335056877, "grad_norm": 0.3089135338217103, "learning_rate": 1.2921779141104296e-05, "loss": 0.4414, "step": 1112 }, { "epoch": 2.2998965873836608, "grad_norm": 0.2673819863402386, "learning_rate": 1.2883435582822087e-05, "loss": 0.4694, "step": 1113 }, { "epoch": 2.3019648397104446, "grad_norm": 0.48697896452386363, "learning_rate": 1.284509202453988e-05, "loss": 0.4157, "step": 1114 }, { "epoch": 2.3040330920372285, "grad_norm": 0.3047807891591508, "learning_rate": 1.280674846625767e-05, "loss": 0.4245, "step": 1115 }, { "epoch": 2.3061013443640124, "grad_norm": 0.2510789572490757, "learning_rate": 1.2768404907975462e-05, "loss": 0.4667, "step": 1116 }, { "epoch": 2.3081695966907962, "grad_norm": 0.2490343343384334, "learning_rate": 1.2730061349693251e-05, "loss": 0.3629, "step": 1117 }, { "epoch": 2.31023784901758, "grad_norm": 0.2492768785022701, "learning_rate": 1.2691717791411042e-05, "loss": 0.4115, "step": 1118 }, { "epoch": 2.312306101344364, "grad_norm": 0.25050248376663975, "learning_rate": 1.2653374233128834e-05, "loss": 0.4587, "step": 1119 }, { "epoch": 2.314374353671148, "grad_norm": 0.2542384993728143, "learning_rate": 1.2615030674846625e-05, "loss": 0.4796, "step": 1120 }, { "epoch": 2.3164426059979317, "grad_norm": 0.24612855456289723, "learning_rate": 1.2576687116564418e-05, "loss": 0.4135, "step": 1121 }, { "epoch": 2.3185108583247156, "grad_norm": 0.25880248831769154, "learning_rate": 1.2538343558282208e-05, "loss": 0.4383, "step": 1122 }, { "epoch": 2.3205791106514995, "grad_norm": 0.2502203971029847, "learning_rate": 1.25e-05, "loss": 0.4682, "step": 1123 }, { "epoch": 2.3226473629782833, "grad_norm": 0.2549428671754507, "learning_rate": 1.2461656441717793e-05, "loss": 0.4312, "step": 1124 }, { "epoch": 2.324715615305067, "grad_norm": 0.24888554684645747, "learning_rate": 1.2423312883435584e-05, "loss": 0.4434, "step": 1125 }, { "epoch": 2.326783867631851, "grad_norm": 0.24271793981919457, "learning_rate": 1.2384969325153376e-05, "loss": 0.3995, "step": 1126 }, { "epoch": 2.328852119958635, "grad_norm": 0.23058471372928252, "learning_rate": 1.2346625766871165e-05, "loss": 0.4084, "step": 1127 }, { "epoch": 2.330920372285419, "grad_norm": 0.2363529848251955, "learning_rate": 1.2308282208588957e-05, "loss": 0.3894, "step": 1128 }, { "epoch": 2.3329886246122027, "grad_norm": 0.23193888316755681, "learning_rate": 1.2269938650306748e-05, "loss": 0.4133, "step": 1129 }, { "epoch": 2.3350568769389866, "grad_norm": 0.25738004529636027, "learning_rate": 1.223159509202454e-05, "loss": 0.4289, "step": 1130 }, { "epoch": 2.3371251292657704, "grad_norm": 0.2232561469209977, "learning_rate": 1.2193251533742333e-05, "loss": 0.3568, "step": 1131 }, { "epoch": 2.3391933815925543, "grad_norm": 0.26272347007571134, "learning_rate": 1.2154907975460124e-05, "loss": 0.4287, "step": 1132 }, { "epoch": 2.341261633919338, "grad_norm": 0.2717844077354033, "learning_rate": 1.2116564417177914e-05, "loss": 0.4503, "step": 1133 }, { "epoch": 2.343329886246122, "grad_norm": 0.223428577368023, "learning_rate": 1.2078220858895705e-05, "loss": 0.3706, "step": 1134 }, { "epoch": 2.345398138572906, "grad_norm": 0.23694155511724851, "learning_rate": 1.2039877300613497e-05, "loss": 0.3585, "step": 1135 }, { "epoch": 2.34746639089969, "grad_norm": 0.2484364425636473, "learning_rate": 1.200153374233129e-05, "loss": 0.4151, "step": 1136 }, { "epoch": 2.3495346432264737, "grad_norm": 0.3177778598026714, "learning_rate": 1.196319018404908e-05, "loss": 0.4196, "step": 1137 }, { "epoch": 2.3516028955532575, "grad_norm": 0.25493741773153283, "learning_rate": 1.1924846625766873e-05, "loss": 0.4483, "step": 1138 }, { "epoch": 2.3536711478800414, "grad_norm": 0.24691710971370717, "learning_rate": 1.1886503067484662e-05, "loss": 0.4295, "step": 1139 }, { "epoch": 2.3557394002068253, "grad_norm": 0.2306195091196214, "learning_rate": 1.1848159509202454e-05, "loss": 0.3687, "step": 1140 }, { "epoch": 2.357807652533609, "grad_norm": 0.21984248826842162, "learning_rate": 1.1809815950920245e-05, "loss": 0.3945, "step": 1141 }, { "epoch": 2.359875904860393, "grad_norm": 0.2148632351051964, "learning_rate": 1.1771472392638037e-05, "loss": 0.3142, "step": 1142 }, { "epoch": 2.361944157187177, "grad_norm": 0.260179408056303, "learning_rate": 1.173312883435583e-05, "loss": 0.4309, "step": 1143 }, { "epoch": 2.3640124095139607, "grad_norm": 0.24267868480582117, "learning_rate": 1.169478527607362e-05, "loss": 0.4028, "step": 1144 }, { "epoch": 2.3660806618407446, "grad_norm": 0.44206332454625163, "learning_rate": 1.1656441717791411e-05, "loss": 0.4095, "step": 1145 }, { "epoch": 2.3681489141675285, "grad_norm": 0.2635140979796242, "learning_rate": 1.1618098159509202e-05, "loss": 0.4215, "step": 1146 }, { "epoch": 2.3702171664943124, "grad_norm": 0.598111407648202, "learning_rate": 1.1579754601226994e-05, "loss": 0.3878, "step": 1147 }, { "epoch": 2.3722854188210962, "grad_norm": 0.2594053451420084, "learning_rate": 1.1541411042944785e-05, "loss": 0.3769, "step": 1148 }, { "epoch": 2.37435367114788, "grad_norm": 0.30153438009619216, "learning_rate": 1.1503067484662577e-05, "loss": 0.3735, "step": 1149 }, { "epoch": 2.376421923474664, "grad_norm": 0.24347987837386934, "learning_rate": 1.146472392638037e-05, "loss": 0.38, "step": 1150 }, { "epoch": 2.378490175801448, "grad_norm": 0.23625102891477248, "learning_rate": 1.142638036809816e-05, "loss": 0.3816, "step": 1151 }, { "epoch": 2.3805584281282317, "grad_norm": 0.7815591614712002, "learning_rate": 1.1388036809815951e-05, "loss": 0.3817, "step": 1152 }, { "epoch": 2.3826266804550156, "grad_norm": 0.24563862588427346, "learning_rate": 1.1349693251533742e-05, "loss": 0.3676, "step": 1153 }, { "epoch": 2.3846949327817994, "grad_norm": 0.225874200994016, "learning_rate": 1.1311349693251534e-05, "loss": 0.3822, "step": 1154 }, { "epoch": 2.3867631851085833, "grad_norm": 0.23954702131507924, "learning_rate": 1.1273006134969327e-05, "loss": 0.4052, "step": 1155 }, { "epoch": 2.388831437435367, "grad_norm": 0.2628565827888718, "learning_rate": 1.1234662576687117e-05, "loss": 0.4486, "step": 1156 }, { "epoch": 2.390899689762151, "grad_norm": 0.238207307145341, "learning_rate": 1.119631901840491e-05, "loss": 0.4215, "step": 1157 }, { "epoch": 2.392967942088935, "grad_norm": 0.2522672149349201, "learning_rate": 1.1157975460122699e-05, "loss": 0.447, "step": 1158 }, { "epoch": 2.395036194415719, "grad_norm": 0.24312128193867902, "learning_rate": 1.1119631901840491e-05, "loss": 0.4322, "step": 1159 }, { "epoch": 2.3971044467425027, "grad_norm": 0.2622762311018332, "learning_rate": 1.1081288343558282e-05, "loss": 0.4077, "step": 1160 }, { "epoch": 2.3991726990692865, "grad_norm": 0.2419301096741495, "learning_rate": 1.1042944785276074e-05, "loss": 0.4492, "step": 1161 }, { "epoch": 2.4012409513960704, "grad_norm": 0.2790019120781894, "learning_rate": 1.1004601226993867e-05, "loss": 0.4169, "step": 1162 }, { "epoch": 2.4033092037228543, "grad_norm": 3.3624375516209075, "learning_rate": 1.0966257668711657e-05, "loss": 0.3813, "step": 1163 }, { "epoch": 2.405377456049638, "grad_norm": 0.2358878946785627, "learning_rate": 1.0927914110429448e-05, "loss": 0.3821, "step": 1164 }, { "epoch": 2.407445708376422, "grad_norm": 0.23933691165739135, "learning_rate": 1.0889570552147239e-05, "loss": 0.4154, "step": 1165 }, { "epoch": 2.409513960703206, "grad_norm": 1.9189712883302865, "learning_rate": 1.0851226993865031e-05, "loss": 0.5331, "step": 1166 }, { "epoch": 2.4115822130299898, "grad_norm": 0.2776792733141182, "learning_rate": 1.0812883435582823e-05, "loss": 0.4203, "step": 1167 }, { "epoch": 2.4136504653567736, "grad_norm": 0.45533559014031705, "learning_rate": 1.0774539877300614e-05, "loss": 0.4038, "step": 1168 }, { "epoch": 2.4157187176835575, "grad_norm": 0.23542465955110067, "learning_rate": 1.0736196319018407e-05, "loss": 0.3746, "step": 1169 }, { "epoch": 2.4177869700103414, "grad_norm": 0.2420512359977709, "learning_rate": 1.0697852760736197e-05, "loss": 0.367, "step": 1170 }, { "epoch": 2.4198552223371252, "grad_norm": 0.2773164133195411, "learning_rate": 1.0659509202453988e-05, "loss": 0.3907, "step": 1171 }, { "epoch": 2.421923474663909, "grad_norm": 0.2564325491252431, "learning_rate": 1.0621165644171779e-05, "loss": 0.4288, "step": 1172 }, { "epoch": 2.423991726990693, "grad_norm": 0.25093118172080303, "learning_rate": 1.0582822085889571e-05, "loss": 0.392, "step": 1173 }, { "epoch": 2.426059979317477, "grad_norm": 0.24189388258794267, "learning_rate": 1.0544478527607363e-05, "loss": 0.3592, "step": 1174 }, { "epoch": 2.4281282316442607, "grad_norm": 0.2472848079952485, "learning_rate": 1.0506134969325154e-05, "loss": 0.3983, "step": 1175 }, { "epoch": 2.4301964839710446, "grad_norm": 0.26410503127935064, "learning_rate": 1.0467791411042947e-05, "loss": 0.4137, "step": 1176 }, { "epoch": 2.4322647362978285, "grad_norm": 0.910454530797915, "learning_rate": 1.0429447852760736e-05, "loss": 0.42, "step": 1177 }, { "epoch": 2.4343329886246123, "grad_norm": 0.23879568977707782, "learning_rate": 1.0391104294478528e-05, "loss": 0.3742, "step": 1178 }, { "epoch": 2.436401240951396, "grad_norm": 0.2673059809534077, "learning_rate": 1.0352760736196319e-05, "loss": 0.3567, "step": 1179 }, { "epoch": 2.43846949327818, "grad_norm": 0.24719446525710712, "learning_rate": 1.0314417177914111e-05, "loss": 0.3717, "step": 1180 }, { "epoch": 2.440537745604964, "grad_norm": 0.26393132426902893, "learning_rate": 1.0276073619631903e-05, "loss": 0.3972, "step": 1181 }, { "epoch": 2.442605997931748, "grad_norm": 0.2501041913256955, "learning_rate": 1.0237730061349694e-05, "loss": 0.4107, "step": 1182 }, { "epoch": 2.4446742502585317, "grad_norm": 0.23597112570488105, "learning_rate": 1.0199386503067485e-05, "loss": 0.4283, "step": 1183 }, { "epoch": 2.4467425025853156, "grad_norm": 0.27103431070261663, "learning_rate": 1.0161042944785275e-05, "loss": 0.4694, "step": 1184 }, { "epoch": 2.4488107549120994, "grad_norm": 0.24892880852787402, "learning_rate": 1.0122699386503068e-05, "loss": 0.422, "step": 1185 }, { "epoch": 2.4508790072388833, "grad_norm": 0.2483139862909167, "learning_rate": 1.008435582822086e-05, "loss": 0.4556, "step": 1186 }, { "epoch": 2.452947259565667, "grad_norm": 0.2281475976478797, "learning_rate": 1.0046012269938651e-05, "loss": 0.3865, "step": 1187 }, { "epoch": 2.455015511892451, "grad_norm": 0.2503223144116202, "learning_rate": 1.0007668711656443e-05, "loss": 0.4037, "step": 1188 }, { "epoch": 2.457083764219235, "grad_norm": 0.24135676195711964, "learning_rate": 9.969325153374232e-06, "loss": 0.4144, "step": 1189 }, { "epoch": 2.459152016546019, "grad_norm": 0.24594870223065224, "learning_rate": 9.930981595092025e-06, "loss": 0.4457, "step": 1190 }, { "epoch": 2.4612202688728027, "grad_norm": 0.24307696189745595, "learning_rate": 9.892638036809815e-06, "loss": 0.4364, "step": 1191 }, { "epoch": 2.4632885211995865, "grad_norm": 0.2489247890063853, "learning_rate": 9.854294478527608e-06, "loss": 0.4237, "step": 1192 }, { "epoch": 2.4653567735263704, "grad_norm": 0.2418248819993506, "learning_rate": 9.8159509202454e-06, "loss": 0.3713, "step": 1193 }, { "epoch": 2.4674250258531543, "grad_norm": 0.23653367452672705, "learning_rate": 9.777607361963191e-06, "loss": 0.4233, "step": 1194 }, { "epoch": 2.469493278179938, "grad_norm": 0.2556562963691753, "learning_rate": 9.739263803680983e-06, "loss": 0.4511, "step": 1195 }, { "epoch": 2.471561530506722, "grad_norm": 0.2276735705375584, "learning_rate": 9.700920245398772e-06, "loss": 0.3729, "step": 1196 }, { "epoch": 2.473629782833506, "grad_norm": 0.26604128628552165, "learning_rate": 9.662576687116565e-06, "loss": 0.4607, "step": 1197 }, { "epoch": 2.4756980351602897, "grad_norm": 0.2210397457846277, "learning_rate": 9.624233128834355e-06, "loss": 0.3862, "step": 1198 }, { "epoch": 2.4777662874870736, "grad_norm": 0.2685607529369599, "learning_rate": 9.585889570552148e-06, "loss": 0.4174, "step": 1199 }, { "epoch": 2.479834539813857, "grad_norm": 0.5921463498888342, "learning_rate": 9.54754601226994e-06, "loss": 0.5136, "step": 1200 }, { "epoch": 2.481902792140641, "grad_norm": 0.25563723069887784, "learning_rate": 9.509202453987731e-06, "loss": 0.4186, "step": 1201 }, { "epoch": 2.483971044467425, "grad_norm": 0.23449417498762165, "learning_rate": 9.470858895705522e-06, "loss": 0.4163, "step": 1202 }, { "epoch": 2.4860392967942087, "grad_norm": 0.24934177803567742, "learning_rate": 9.432515337423312e-06, "loss": 0.4091, "step": 1203 }, { "epoch": 2.4881075491209925, "grad_norm": 0.24652577199501838, "learning_rate": 9.394171779141105e-06, "loss": 0.437, "step": 1204 }, { "epoch": 2.4901758014477764, "grad_norm": 0.26272943111408364, "learning_rate": 9.355828220858897e-06, "loss": 0.4809, "step": 1205 }, { "epoch": 2.4922440537745603, "grad_norm": 0.24666765002129615, "learning_rate": 9.317484662576688e-06, "loss": 0.3962, "step": 1206 }, { "epoch": 2.494312306101344, "grad_norm": 0.23853274013165168, "learning_rate": 9.27914110429448e-06, "loss": 0.429, "step": 1207 }, { "epoch": 2.496380558428128, "grad_norm": 0.2422978413861295, "learning_rate": 9.24079754601227e-06, "loss": 0.4466, "step": 1208 }, { "epoch": 2.498448810754912, "grad_norm": 0.2358464563312213, "learning_rate": 9.202453987730062e-06, "loss": 0.4378, "step": 1209 }, { "epoch": 2.5005170630816957, "grad_norm": 0.25763963114557403, "learning_rate": 9.164110429447852e-06, "loss": 0.4645, "step": 1210 }, { "epoch": 2.5025853154084796, "grad_norm": 0.2829969356796723, "learning_rate": 9.125766871165645e-06, "loss": 0.4458, "step": 1211 }, { "epoch": 2.5046535677352635, "grad_norm": 0.25117053245880244, "learning_rate": 9.087423312883437e-06, "loss": 0.4246, "step": 1212 }, { "epoch": 2.5067218200620474, "grad_norm": 0.2454304764160962, "learning_rate": 9.049079754601228e-06, "loss": 0.4586, "step": 1213 }, { "epoch": 2.5087900723888312, "grad_norm": 0.2354487515615165, "learning_rate": 9.010736196319018e-06, "loss": 0.3851, "step": 1214 }, { "epoch": 2.510858324715615, "grad_norm": 0.2754272378079927, "learning_rate": 8.972392638036809e-06, "loss": 0.4115, "step": 1215 }, { "epoch": 2.512926577042399, "grad_norm": 0.2432120200644478, "learning_rate": 8.934049079754602e-06, "loss": 0.3602, "step": 1216 }, { "epoch": 2.514994829369183, "grad_norm": 0.26592998149305136, "learning_rate": 8.895705521472392e-06, "loss": 0.4259, "step": 1217 }, { "epoch": 2.5170630816959667, "grad_norm": 0.25055653139844697, "learning_rate": 8.857361963190185e-06, "loss": 0.4542, "step": 1218 }, { "epoch": 2.5191313340227506, "grad_norm": 0.24139343023575258, "learning_rate": 8.819018404907977e-06, "loss": 0.3522, "step": 1219 }, { "epoch": 2.5211995863495344, "grad_norm": 0.25749643871552047, "learning_rate": 8.780674846625768e-06, "loss": 0.3805, "step": 1220 }, { "epoch": 2.5232678386763183, "grad_norm": 0.23803606375736044, "learning_rate": 8.742331288343558e-06, "loss": 0.403, "step": 1221 }, { "epoch": 2.525336091003102, "grad_norm": 0.2625320425949623, "learning_rate": 8.703987730061349e-06, "loss": 0.4554, "step": 1222 }, { "epoch": 2.527404343329886, "grad_norm": 0.23054572085671735, "learning_rate": 8.665644171779141e-06, "loss": 0.3963, "step": 1223 }, { "epoch": 2.52947259565667, "grad_norm": 0.23955223797925965, "learning_rate": 8.627300613496934e-06, "loss": 0.4493, "step": 1224 }, { "epoch": 2.531540847983454, "grad_norm": 0.23598405903273562, "learning_rate": 8.588957055214725e-06, "loss": 0.413, "step": 1225 }, { "epoch": 2.5336091003102377, "grad_norm": 0.23123649785788816, "learning_rate": 8.550613496932517e-06, "loss": 0.3918, "step": 1226 }, { "epoch": 2.5356773526370215, "grad_norm": 0.268629460994331, "learning_rate": 8.512269938650306e-06, "loss": 0.4647, "step": 1227 }, { "epoch": 2.5377456049638054, "grad_norm": 0.25739789101914284, "learning_rate": 8.473926380368098e-06, "loss": 0.447, "step": 1228 }, { "epoch": 2.5398138572905893, "grad_norm": 0.23983441102583017, "learning_rate": 8.435582822085889e-06, "loss": 0.4396, "step": 1229 }, { "epoch": 2.541882109617373, "grad_norm": 0.2535915495342421, "learning_rate": 8.397239263803681e-06, "loss": 0.4469, "step": 1230 }, { "epoch": 2.543950361944157, "grad_norm": 0.2317425368378893, "learning_rate": 8.358895705521474e-06, "loss": 0.3974, "step": 1231 }, { "epoch": 2.546018614270941, "grad_norm": 0.23863224012500925, "learning_rate": 8.320552147239265e-06, "loss": 0.4257, "step": 1232 }, { "epoch": 2.5480868665977248, "grad_norm": 0.2232001546832105, "learning_rate": 8.282208588957055e-06, "loss": 0.3819, "step": 1233 }, { "epoch": 2.5501551189245086, "grad_norm": 0.2386711303663504, "learning_rate": 8.243865030674846e-06, "loss": 0.4304, "step": 1234 }, { "epoch": 2.5522233712512925, "grad_norm": 0.2533090751997401, "learning_rate": 8.205521472392638e-06, "loss": 0.4506, "step": 1235 }, { "epoch": 2.5542916235780764, "grad_norm": 0.22112729477728466, "learning_rate": 8.16717791411043e-06, "loss": 0.3633, "step": 1236 }, { "epoch": 2.5563598759048602, "grad_norm": 0.24343235448389464, "learning_rate": 8.128834355828221e-06, "loss": 0.4079, "step": 1237 }, { "epoch": 2.558428128231644, "grad_norm": 0.21672192806399532, "learning_rate": 8.090490797546014e-06, "loss": 0.3651, "step": 1238 }, { "epoch": 2.560496380558428, "grad_norm": 7.097529603420055, "learning_rate": 8.052147239263803e-06, "loss": 0.8225, "step": 1239 }, { "epoch": 2.562564632885212, "grad_norm": 0.2573696463239592, "learning_rate": 8.013803680981595e-06, "loss": 0.3976, "step": 1240 }, { "epoch": 2.5646328852119957, "grad_norm": 0.22318021753248032, "learning_rate": 7.975460122699386e-06, "loss": 0.3711, "step": 1241 }, { "epoch": 2.5667011375387796, "grad_norm": 0.24905335262926967, "learning_rate": 7.937116564417178e-06, "loss": 0.3999, "step": 1242 }, { "epoch": 2.5687693898655635, "grad_norm": 0.23889063360142568, "learning_rate": 7.89877300613497e-06, "loss": 0.4089, "step": 1243 }, { "epoch": 2.5708376421923473, "grad_norm": 1.6493772192197855, "learning_rate": 7.860429447852761e-06, "loss": 0.3946, "step": 1244 }, { "epoch": 2.572905894519131, "grad_norm": 0.24084960444000936, "learning_rate": 7.822085889570554e-06, "loss": 0.4025, "step": 1245 }, { "epoch": 2.574974146845915, "grad_norm": 0.2427154870866518, "learning_rate": 7.783742331288343e-06, "loss": 0.4333, "step": 1246 }, { "epoch": 2.577042399172699, "grad_norm": 0.2214449981423447, "learning_rate": 7.745398773006135e-06, "loss": 0.3283, "step": 1247 }, { "epoch": 2.579110651499483, "grad_norm": 0.25156864339486634, "learning_rate": 7.707055214723926e-06, "loss": 0.4468, "step": 1248 }, { "epoch": 2.5811789038262667, "grad_norm": 0.24912007519065174, "learning_rate": 7.668711656441718e-06, "loss": 0.3889, "step": 1249 }, { "epoch": 2.5832471561530506, "grad_norm": 0.2217204354067659, "learning_rate": 7.63036809815951e-06, "loss": 0.3877, "step": 1250 }, { "epoch": 2.5853154084798344, "grad_norm": 0.24749167771428826, "learning_rate": 7.592024539877301e-06, "loss": 0.3868, "step": 1251 }, { "epoch": 2.5873836608066183, "grad_norm": 2.085086346379053, "learning_rate": 7.553680981595092e-06, "loss": 0.471, "step": 1252 }, { "epoch": 2.589451913133402, "grad_norm": 0.2451609050223608, "learning_rate": 7.5153374233128836e-06, "loss": 0.4212, "step": 1253 }, { "epoch": 2.591520165460186, "grad_norm": 0.2558225933394133, "learning_rate": 7.476993865030675e-06, "loss": 0.4977, "step": 1254 }, { "epoch": 2.59358841778697, "grad_norm": 0.22997892942450365, "learning_rate": 7.438650306748467e-06, "loss": 0.3894, "step": 1255 }, { "epoch": 2.595656670113754, "grad_norm": 0.23484539569696872, "learning_rate": 7.400306748466258e-06, "loss": 0.4381, "step": 1256 }, { "epoch": 2.5977249224405377, "grad_norm": 0.2165766924948714, "learning_rate": 7.36196319018405e-06, "loss": 0.3754, "step": 1257 }, { "epoch": 2.5997931747673215, "grad_norm": 0.23350484020096793, "learning_rate": 7.3236196319018404e-06, "loss": 0.4455, "step": 1258 }, { "epoch": 2.6018614270941054, "grad_norm": 0.23670413798983533, "learning_rate": 7.285276073619632e-06, "loss": 0.349, "step": 1259 }, { "epoch": 2.6039296794208893, "grad_norm": 0.21747206161344818, "learning_rate": 7.2469325153374235e-06, "loss": 0.3413, "step": 1260 }, { "epoch": 2.605997931747673, "grad_norm": 0.24946198873981634, "learning_rate": 7.208588957055215e-06, "loss": 0.4394, "step": 1261 }, { "epoch": 2.608066184074457, "grad_norm": 0.21764787797900562, "learning_rate": 7.170245398773007e-06, "loss": 0.3678, "step": 1262 }, { "epoch": 2.610134436401241, "grad_norm": 0.22601292398603504, "learning_rate": 7.131901840490798e-06, "loss": 0.3373, "step": 1263 }, { "epoch": 2.6122026887280247, "grad_norm": 0.24030992054760522, "learning_rate": 7.09355828220859e-06, "loss": 0.3804, "step": 1264 }, { "epoch": 2.6142709410548086, "grad_norm": 0.2267271153708792, "learning_rate": 7.05521472392638e-06, "loss": 0.385, "step": 1265 }, { "epoch": 2.6163391933815925, "grad_norm": 0.250153855880448, "learning_rate": 7.016871165644172e-06, "loss": 0.3956, "step": 1266 }, { "epoch": 2.6184074457083764, "grad_norm": 0.23763280971324047, "learning_rate": 6.9785276073619635e-06, "loss": 0.4439, "step": 1267 }, { "epoch": 2.6204756980351602, "grad_norm": 0.22586380099303185, "learning_rate": 6.940184049079755e-06, "loss": 0.3912, "step": 1268 }, { "epoch": 2.622543950361944, "grad_norm": 0.24497268400316116, "learning_rate": 6.901840490797547e-06, "loss": 0.3961, "step": 1269 }, { "epoch": 2.624612202688728, "grad_norm": 0.242402761970362, "learning_rate": 6.863496932515338e-06, "loss": 0.4172, "step": 1270 }, { "epoch": 2.626680455015512, "grad_norm": 0.21470098103594537, "learning_rate": 6.825153374233129e-06, "loss": 0.3514, "step": 1271 }, { "epoch": 2.6287487073422957, "grad_norm": 0.2425018561405213, "learning_rate": 6.78680981595092e-06, "loss": 0.4224, "step": 1272 }, { "epoch": 2.6308169596690796, "grad_norm": 0.2390257826566082, "learning_rate": 6.748466257668712e-06, "loss": 0.3593, "step": 1273 }, { "epoch": 2.6328852119958635, "grad_norm": 0.23716157350818512, "learning_rate": 6.7101226993865035e-06, "loss": 0.371, "step": 1274 }, { "epoch": 2.6349534643226473, "grad_norm": 0.23903986564607183, "learning_rate": 6.671779141104295e-06, "loss": 0.4555, "step": 1275 }, { "epoch": 2.637021716649431, "grad_norm": 0.22598327695204942, "learning_rate": 6.6334355828220866e-06, "loss": 0.3707, "step": 1276 }, { "epoch": 2.639089968976215, "grad_norm": 0.24103737563731364, "learning_rate": 6.595092024539877e-06, "loss": 0.4396, "step": 1277 }, { "epoch": 2.641158221302999, "grad_norm": 0.22946623622361914, "learning_rate": 6.556748466257669e-06, "loss": 0.3673, "step": 1278 }, { "epoch": 2.643226473629783, "grad_norm": 0.24513767218458843, "learning_rate": 6.51840490797546e-06, "loss": 0.4293, "step": 1279 }, { "epoch": 2.6452947259565667, "grad_norm": 0.23144170875758502, "learning_rate": 6.480061349693252e-06, "loss": 0.4281, "step": 1280 }, { "epoch": 2.6473629782833505, "grad_norm": 0.23441979984142994, "learning_rate": 6.4417177914110434e-06, "loss": 0.3656, "step": 1281 }, { "epoch": 2.6494312306101344, "grad_norm": 0.2348451874222163, "learning_rate": 6.403374233128835e-06, "loss": 0.3766, "step": 1282 }, { "epoch": 2.6514994829369183, "grad_norm": 0.2175371843799904, "learning_rate": 6.365030674846626e-06, "loss": 0.3996, "step": 1283 }, { "epoch": 2.653567735263702, "grad_norm": 0.23692334559020709, "learning_rate": 6.326687116564417e-06, "loss": 0.4206, "step": 1284 }, { "epoch": 2.655635987590486, "grad_norm": 0.2490886909345003, "learning_rate": 6.288343558282209e-06, "loss": 0.4179, "step": 1285 }, { "epoch": 2.65770423991727, "grad_norm": 0.22602273164403555, "learning_rate": 6.25e-06, "loss": 0.3863, "step": 1286 }, { "epoch": 2.6597724922440538, "grad_norm": 0.22249304077896287, "learning_rate": 6.211656441717792e-06, "loss": 0.4295, "step": 1287 }, { "epoch": 2.6618407445708376, "grad_norm": 0.2267324641800898, "learning_rate": 6.1733128834355825e-06, "loss": 0.4523, "step": 1288 }, { "epoch": 2.6639089968976215, "grad_norm": 0.2482588683296294, "learning_rate": 6.134969325153374e-06, "loss": 0.4063, "step": 1289 }, { "epoch": 2.6659772492244054, "grad_norm": 0.2367037429167426, "learning_rate": 6.0966257668711665e-06, "loss": 0.4245, "step": 1290 }, { "epoch": 2.6680455015511892, "grad_norm": 0.22621037457609883, "learning_rate": 6.058282208588957e-06, "loss": 0.3684, "step": 1291 }, { "epoch": 2.670113753877973, "grad_norm": 0.23282307203734326, "learning_rate": 6.019938650306749e-06, "loss": 0.4595, "step": 1292 }, { "epoch": 2.672182006204757, "grad_norm": 0.22410072958587843, "learning_rate": 5.98159509202454e-06, "loss": 0.3656, "step": 1293 }, { "epoch": 2.674250258531541, "grad_norm": 0.25314067279115093, "learning_rate": 5.943251533742331e-06, "loss": 0.4134, "step": 1294 }, { "epoch": 2.6763185108583247, "grad_norm": 0.2314580776272318, "learning_rate": 5.9049079754601225e-06, "loss": 0.3925, "step": 1295 }, { "epoch": 2.6783867631851086, "grad_norm": 0.22070627169716936, "learning_rate": 5.866564417177915e-06, "loss": 0.3793, "step": 1296 }, { "epoch": 2.6804550155118925, "grad_norm": 0.21282236538535076, "learning_rate": 5.828220858895706e-06, "loss": 0.3557, "step": 1297 }, { "epoch": 2.6825232678386763, "grad_norm": 0.24193803902851324, "learning_rate": 5.789877300613497e-06, "loss": 0.3533, "step": 1298 }, { "epoch": 2.68459152016546, "grad_norm": 0.23673195720954027, "learning_rate": 5.751533742331289e-06, "loss": 0.386, "step": 1299 }, { "epoch": 2.686659772492244, "grad_norm": 0.2355005058862174, "learning_rate": 5.71319018404908e-06, "loss": 0.3797, "step": 1300 }, { "epoch": 2.688728024819028, "grad_norm": 0.25771475308182346, "learning_rate": 5.674846625766871e-06, "loss": 0.417, "step": 1301 }, { "epoch": 2.690796277145812, "grad_norm": 0.24668752310863026, "learning_rate": 5.636503067484663e-06, "loss": 0.4251, "step": 1302 }, { "epoch": 2.6928645294725957, "grad_norm": 0.24427448969253335, "learning_rate": 5.598159509202455e-06, "loss": 0.3922, "step": 1303 }, { "epoch": 2.6949327817993796, "grad_norm": 0.2333810345247235, "learning_rate": 5.5598159509202456e-06, "loss": 0.3498, "step": 1304 }, { "epoch": 2.6970010341261634, "grad_norm": 0.24960820196500413, "learning_rate": 5.521472392638037e-06, "loss": 0.4209, "step": 1305 }, { "epoch": 2.6990692864529473, "grad_norm": 0.23247499660281146, "learning_rate": 5.483128834355829e-06, "loss": 0.4, "step": 1306 }, { "epoch": 2.701137538779731, "grad_norm": 0.22747148198127945, "learning_rate": 5.444785276073619e-06, "loss": 0.4163, "step": 1307 }, { "epoch": 2.703205791106515, "grad_norm": 0.2367439058407517, "learning_rate": 5.406441717791412e-06, "loss": 0.402, "step": 1308 }, { "epoch": 2.705274043433299, "grad_norm": 0.23813857193470284, "learning_rate": 5.368098159509203e-06, "loss": 0.4059, "step": 1309 }, { "epoch": 2.707342295760083, "grad_norm": 0.24951137644587174, "learning_rate": 5.329754601226994e-06, "loss": 0.415, "step": 1310 }, { "epoch": 2.7094105480868667, "grad_norm": 0.24763200676073985, "learning_rate": 5.2914110429447855e-06, "loss": 0.422, "step": 1311 }, { "epoch": 2.7114788004136505, "grad_norm": 0.23957908049040108, "learning_rate": 5.253067484662577e-06, "loss": 0.4245, "step": 1312 }, { "epoch": 2.7135470527404344, "grad_norm": 0.24669512328867502, "learning_rate": 5.214723926380368e-06, "loss": 0.4014, "step": 1313 }, { "epoch": 2.7156153050672183, "grad_norm": 0.23103425048089932, "learning_rate": 5.176380368098159e-06, "loss": 0.3694, "step": 1314 }, { "epoch": 2.717683557394002, "grad_norm": 0.24799971384253977, "learning_rate": 5.138036809815952e-06, "loss": 0.3765, "step": 1315 }, { "epoch": 2.719751809720786, "grad_norm": 0.23910877355637675, "learning_rate": 5.099693251533742e-06, "loss": 0.4279, "step": 1316 }, { "epoch": 2.72182006204757, "grad_norm": 0.25994417268927617, "learning_rate": 5.061349693251534e-06, "loss": 0.4487, "step": 1317 }, { "epoch": 2.7238883143743537, "grad_norm": 0.2287367983507004, "learning_rate": 5.0230061349693255e-06, "loss": 0.3574, "step": 1318 }, { "epoch": 2.7259565667011376, "grad_norm": 0.23433774080624678, "learning_rate": 4.984662576687116e-06, "loss": 0.4248, "step": 1319 }, { "epoch": 2.7280248190279215, "grad_norm": 0.22856976655333866, "learning_rate": 4.946319018404908e-06, "loss": 0.4518, "step": 1320 }, { "epoch": 2.7300930713547054, "grad_norm": 0.244030520483506, "learning_rate": 4.9079754601227e-06, "loss": 0.4427, "step": 1321 }, { "epoch": 2.7321613236814892, "grad_norm": 0.2742862451031677, "learning_rate": 4.869631901840492e-06, "loss": 0.4731, "step": 1322 }, { "epoch": 2.734229576008273, "grad_norm": 0.24549835738211206, "learning_rate": 4.831288343558282e-06, "loss": 0.4216, "step": 1323 }, { "epoch": 2.736297828335057, "grad_norm": 0.24612245393582155, "learning_rate": 4.792944785276074e-06, "loss": 0.4126, "step": 1324 }, { "epoch": 2.738366080661841, "grad_norm": 0.23423630985532012, "learning_rate": 4.7546012269938654e-06, "loss": 0.3925, "step": 1325 }, { "epoch": 2.7404343329886247, "grad_norm": 0.23001946166243945, "learning_rate": 4.716257668711656e-06, "loss": 0.3952, "step": 1326 }, { "epoch": 2.7425025853154086, "grad_norm": 0.24694167332462102, "learning_rate": 4.6779141104294485e-06, "loss": 0.4277, "step": 1327 }, { "epoch": 2.7445708376421925, "grad_norm": 0.22208845148043277, "learning_rate": 4.63957055214724e-06, "loss": 0.3634, "step": 1328 }, { "epoch": 2.7466390899689763, "grad_norm": 0.24205397553870453, "learning_rate": 4.601226993865031e-06, "loss": 0.4252, "step": 1329 }, { "epoch": 2.74870734229576, "grad_norm": 0.23843807509362894, "learning_rate": 4.562883435582822e-06, "loss": 0.4644, "step": 1330 }, { "epoch": 2.750775594622544, "grad_norm": 0.22935045926869999, "learning_rate": 4.524539877300614e-06, "loss": 0.4032, "step": 1331 }, { "epoch": 2.752843846949328, "grad_norm": 0.23872836314165677, "learning_rate": 4.4861963190184046e-06, "loss": 0.4475, "step": 1332 }, { "epoch": 2.754912099276112, "grad_norm": 0.2210164404785503, "learning_rate": 4.447852760736196e-06, "loss": 0.4132, "step": 1333 }, { "epoch": 2.7569803516028957, "grad_norm": 0.23241972051317047, "learning_rate": 4.4095092024539885e-06, "loss": 0.3963, "step": 1334 }, { "epoch": 2.7590486039296795, "grad_norm": 0.20967664521294085, "learning_rate": 4.371165644171779e-06, "loss": 0.3967, "step": 1335 }, { "epoch": 2.7611168562564634, "grad_norm": 0.23324236555785433, "learning_rate": 4.332822085889571e-06, "loss": 0.4101, "step": 1336 }, { "epoch": 2.7631851085832473, "grad_norm": 0.23819602900054457, "learning_rate": 4.294478527607362e-06, "loss": 0.4452, "step": 1337 }, { "epoch": 2.765253360910031, "grad_norm": 0.23704758544719598, "learning_rate": 4.256134969325153e-06, "loss": 0.4228, "step": 1338 }, { "epoch": 2.767321613236815, "grad_norm": 0.2370895713407742, "learning_rate": 4.2177914110429445e-06, "loss": 0.4177, "step": 1339 }, { "epoch": 2.769389865563599, "grad_norm": 0.21883889467570455, "learning_rate": 4.179447852760737e-06, "loss": 0.4171, "step": 1340 }, { "epoch": 2.7714581178903828, "grad_norm": 2.796454595265003, "learning_rate": 4.141104294478528e-06, "loss": 0.3815, "step": 1341 }, { "epoch": 2.7735263702171666, "grad_norm": 0.21460079626401038, "learning_rate": 4.102760736196319e-06, "loss": 0.3387, "step": 1342 }, { "epoch": 2.7755946225439505, "grad_norm": 0.22404194718311257, "learning_rate": 4.064417177914111e-06, "loss": 0.3737, "step": 1343 }, { "epoch": 2.7776628748707344, "grad_norm": 0.2377686343060159, "learning_rate": 4.026073619631901e-06, "loss": 0.4028, "step": 1344 }, { "epoch": 2.7797311271975182, "grad_norm": 0.24407711016389058, "learning_rate": 3.987730061349693e-06, "loss": 0.4311, "step": 1345 }, { "epoch": 2.781799379524302, "grad_norm": 0.23666229447327472, "learning_rate": 3.949386503067485e-06, "loss": 0.4579, "step": 1346 }, { "epoch": 2.783867631851086, "grad_norm": 0.22575557971991095, "learning_rate": 3.911042944785277e-06, "loss": 0.4264, "step": 1347 }, { "epoch": 2.78593588417787, "grad_norm": 0.21756836318051992, "learning_rate": 3.872699386503068e-06, "loss": 0.3788, "step": 1348 }, { "epoch": 2.7880041365046537, "grad_norm": 0.23942637725283009, "learning_rate": 3.834355828220859e-06, "loss": 0.3996, "step": 1349 }, { "epoch": 2.7900723888314376, "grad_norm": 0.21555758774176778, "learning_rate": 3.7960122699386507e-06, "loss": 0.3593, "step": 1350 }, { "epoch": 2.7921406411582215, "grad_norm": 0.22032821744843778, "learning_rate": 3.7576687116564418e-06, "loss": 0.3351, "step": 1351 }, { "epoch": 2.7942088934850053, "grad_norm": 0.2196766546809181, "learning_rate": 3.7193251533742333e-06, "loss": 0.3819, "step": 1352 }, { "epoch": 2.796277145811789, "grad_norm": 0.2501221709153548, "learning_rate": 3.680981595092025e-06, "loss": 0.4149, "step": 1353 }, { "epoch": 2.798345398138573, "grad_norm": 0.21588338768229817, "learning_rate": 3.642638036809816e-06, "loss": 0.3908, "step": 1354 }, { "epoch": 2.800413650465357, "grad_norm": 0.24271984534677796, "learning_rate": 3.6042944785276075e-06, "loss": 0.5167, "step": 1355 }, { "epoch": 2.802481902792141, "grad_norm": 0.22023186061434885, "learning_rate": 3.565950920245399e-06, "loss": 0.4144, "step": 1356 }, { "epoch": 2.8045501551189247, "grad_norm": 0.22600736465506474, "learning_rate": 3.52760736196319e-06, "loss": 0.3848, "step": 1357 }, { "epoch": 2.8066184074457086, "grad_norm": 0.20722821167531108, "learning_rate": 3.4892638036809817e-06, "loss": 0.3392, "step": 1358 }, { "epoch": 2.8086866597724924, "grad_norm": 0.35031581839179826, "learning_rate": 3.4509202453987733e-06, "loss": 0.5139, "step": 1359 }, { "epoch": 2.8107549120992763, "grad_norm": 0.2284965933117697, "learning_rate": 3.4125766871165644e-06, "loss": 0.3773, "step": 1360 }, { "epoch": 2.81282316442606, "grad_norm": 0.22521873890080166, "learning_rate": 3.374233128834356e-06, "loss": 0.4199, "step": 1361 }, { "epoch": 2.814891416752844, "grad_norm": 0.2132151886479673, "learning_rate": 3.3358895705521475e-06, "loss": 0.36, "step": 1362 }, { "epoch": 2.816959669079628, "grad_norm": 0.22459512966263295, "learning_rate": 3.2975460122699386e-06, "loss": 0.3727, "step": 1363 }, { "epoch": 2.819027921406412, "grad_norm": 0.2206046227546624, "learning_rate": 3.25920245398773e-06, "loss": 0.3713, "step": 1364 }, { "epoch": 2.8210961737331957, "grad_norm": 0.24915384602483126, "learning_rate": 3.2208588957055217e-06, "loss": 0.4651, "step": 1365 }, { "epoch": 2.8231644260599795, "grad_norm": 0.22771851515009833, "learning_rate": 3.182515337423313e-06, "loss": 0.3928, "step": 1366 }, { "epoch": 2.8252326783867634, "grad_norm": 0.23611533987200461, "learning_rate": 3.1441717791411044e-06, "loss": 0.4271, "step": 1367 }, { "epoch": 2.8273009307135473, "grad_norm": 0.21693839815657986, "learning_rate": 3.105828220858896e-06, "loss": 0.3557, "step": 1368 }, { "epoch": 2.829369183040331, "grad_norm": 0.24064769080242146, "learning_rate": 3.067484662576687e-06, "loss": 0.4151, "step": 1369 }, { "epoch": 2.831437435367115, "grad_norm": 0.2275401644897199, "learning_rate": 3.0291411042944786e-06, "loss": 0.4186, "step": 1370 }, { "epoch": 2.833505687693899, "grad_norm": 0.22465251991951227, "learning_rate": 2.99079754601227e-06, "loss": 0.422, "step": 1371 }, { "epoch": 2.8355739400206827, "grad_norm": 0.21377156660400864, "learning_rate": 2.9524539877300613e-06, "loss": 0.3499, "step": 1372 }, { "epoch": 2.8376421923474666, "grad_norm": 0.2281536648772068, "learning_rate": 2.914110429447853e-06, "loss": 0.4283, "step": 1373 }, { "epoch": 2.8397104446742505, "grad_norm": 0.22640456233655462, "learning_rate": 2.8757668711656443e-06, "loss": 0.408, "step": 1374 }, { "epoch": 2.8417786970010344, "grad_norm": 0.23805722803635496, "learning_rate": 2.8374233128834355e-06, "loss": 0.4292, "step": 1375 }, { "epoch": 2.8438469493278182, "grad_norm": 0.23922312204571458, "learning_rate": 2.7990797546012274e-06, "loss": 0.3899, "step": 1376 }, { "epoch": 2.845915201654602, "grad_norm": 0.22339549632947864, "learning_rate": 2.7607361963190186e-06, "loss": 0.3896, "step": 1377 }, { "epoch": 2.847983453981386, "grad_norm": 0.22650670337028347, "learning_rate": 2.7223926380368097e-06, "loss": 0.4147, "step": 1378 }, { "epoch": 2.85005170630817, "grad_norm": 0.22966738408239787, "learning_rate": 2.6840490797546016e-06, "loss": 0.4231, "step": 1379 }, { "epoch": 2.8521199586349537, "grad_norm": 0.23023352959575627, "learning_rate": 2.6457055214723928e-06, "loss": 0.4317, "step": 1380 }, { "epoch": 2.8541882109617376, "grad_norm": 0.2365917164007588, "learning_rate": 2.607361963190184e-06, "loss": 0.4346, "step": 1381 }, { "epoch": 2.8562564632885215, "grad_norm": 0.22323855600267783, "learning_rate": 2.569018404907976e-06, "loss": 0.3918, "step": 1382 }, { "epoch": 2.8583247156153053, "grad_norm": 0.22075194705051918, "learning_rate": 2.530674846625767e-06, "loss": 0.4013, "step": 1383 }, { "epoch": 2.860392967942089, "grad_norm": 0.22204766763397463, "learning_rate": 2.492331288343558e-06, "loss": 0.4233, "step": 1384 }, { "epoch": 2.862461220268873, "grad_norm": 0.2441776528363303, "learning_rate": 2.45398773006135e-06, "loss": 0.4159, "step": 1385 }, { "epoch": 2.864529472595657, "grad_norm": 0.2197980522571548, "learning_rate": 2.415644171779141e-06, "loss": 0.3611, "step": 1386 }, { "epoch": 2.866597724922441, "grad_norm": 0.23114188351319995, "learning_rate": 2.3773006134969327e-06, "loss": 0.4187, "step": 1387 }, { "epoch": 2.8686659772492247, "grad_norm": 0.2365954075615859, "learning_rate": 2.3389570552147243e-06, "loss": 0.4223, "step": 1388 }, { "epoch": 2.8707342295760085, "grad_norm": 0.21809068902600087, "learning_rate": 2.3006134969325154e-06, "loss": 0.3614, "step": 1389 }, { "epoch": 2.8728024819027924, "grad_norm": 0.23283478401935656, "learning_rate": 2.262269938650307e-06, "loss": 0.4235, "step": 1390 }, { "epoch": 2.8748707342295763, "grad_norm": 0.2271115781425429, "learning_rate": 2.223926380368098e-06, "loss": 0.4207, "step": 1391 }, { "epoch": 2.8769389865563597, "grad_norm": 0.22793968417809696, "learning_rate": 2.1855828220858896e-06, "loss": 0.3775, "step": 1392 }, { "epoch": 2.8790072388831436, "grad_norm": 0.2596355278250528, "learning_rate": 2.147239263803681e-06, "loss": 0.5225, "step": 1393 }, { "epoch": 2.8810754912099275, "grad_norm": 0.2422456861893422, "learning_rate": 2.1088957055214723e-06, "loss": 0.436, "step": 1394 }, { "epoch": 2.8831437435367113, "grad_norm": 0.24567849162951397, "learning_rate": 2.070552147239264e-06, "loss": 0.4129, "step": 1395 }, { "epoch": 2.885211995863495, "grad_norm": 0.22394689379271054, "learning_rate": 2.0322085889570554e-06, "loss": 0.3734, "step": 1396 }, { "epoch": 2.887280248190279, "grad_norm": 0.2200905323086683, "learning_rate": 1.9938650306748465e-06, "loss": 0.3482, "step": 1397 }, { "epoch": 2.889348500517063, "grad_norm": 0.22927654628479927, "learning_rate": 1.9555214723926384e-06, "loss": 0.4075, "step": 1398 }, { "epoch": 2.891416752843847, "grad_norm": 0.22440255693268946, "learning_rate": 1.9171779141104296e-06, "loss": 0.3694, "step": 1399 }, { "epoch": 2.8934850051706307, "grad_norm": 0.23562730416485717, "learning_rate": 1.8788343558282209e-06, "loss": 0.3614, "step": 1400 }, { "epoch": 2.8955532574974145, "grad_norm": 0.2344117665409843, "learning_rate": 1.8404907975460124e-06, "loss": 0.3878, "step": 1401 }, { "epoch": 2.8976215098241984, "grad_norm": 0.22237711463014465, "learning_rate": 1.8021472392638038e-06, "loss": 0.3953, "step": 1402 }, { "epoch": 2.8996897621509823, "grad_norm": 0.23864425283784793, "learning_rate": 1.763803680981595e-06, "loss": 0.4499, "step": 1403 }, { "epoch": 2.901758014477766, "grad_norm": 0.23449582351919557, "learning_rate": 1.7254601226993866e-06, "loss": 0.4092, "step": 1404 }, { "epoch": 2.90382626680455, "grad_norm": 0.22531610503722208, "learning_rate": 1.687116564417178e-06, "loss": 0.4072, "step": 1405 }, { "epoch": 2.905894519131334, "grad_norm": 0.2182796818604039, "learning_rate": 1.6487730061349693e-06, "loss": 0.4299, "step": 1406 }, { "epoch": 2.9079627714581178, "grad_norm": 0.2268563575955579, "learning_rate": 1.6104294478527609e-06, "loss": 0.4115, "step": 1407 }, { "epoch": 2.9100310237849016, "grad_norm": 0.21641892434036142, "learning_rate": 1.5720858895705522e-06, "loss": 0.3696, "step": 1408 }, { "epoch": 2.9120992761116855, "grad_norm": 0.22867717715779334, "learning_rate": 1.5337423312883435e-06, "loss": 0.4204, "step": 1409 }, { "epoch": 2.9141675284384694, "grad_norm": 0.21473882732111155, "learning_rate": 1.495398773006135e-06, "loss": 0.3802, "step": 1410 }, { "epoch": 2.9162357807652532, "grad_norm": 0.23483406882391122, "learning_rate": 1.4570552147239264e-06, "loss": 0.4411, "step": 1411 }, { "epoch": 2.918304033092037, "grad_norm": 0.22441897206749462, "learning_rate": 1.4187116564417177e-06, "loss": 0.3948, "step": 1412 }, { "epoch": 2.920372285418821, "grad_norm": 0.24299276980423967, "learning_rate": 1.3803680981595093e-06, "loss": 0.4219, "step": 1413 }, { "epoch": 2.922440537745605, "grad_norm": 0.23403569573759542, "learning_rate": 1.3420245398773008e-06, "loss": 0.4016, "step": 1414 }, { "epoch": 2.9245087900723887, "grad_norm": 0.21743740164485792, "learning_rate": 1.303680981595092e-06, "loss": 0.3784, "step": 1415 }, { "epoch": 2.9265770423991726, "grad_norm": 0.22220137610806004, "learning_rate": 1.2653374233128835e-06, "loss": 0.396, "step": 1416 }, { "epoch": 2.9286452947259565, "grad_norm": 0.20902748885818637, "learning_rate": 1.226993865030675e-06, "loss": 0.3451, "step": 1417 }, { "epoch": 2.9307135470527403, "grad_norm": 0.21245690178479318, "learning_rate": 1.1886503067484664e-06, "loss": 0.382, "step": 1418 }, { "epoch": 2.932781799379524, "grad_norm": 0.20209347644464912, "learning_rate": 1.1503067484662577e-06, "loss": 0.3327, "step": 1419 }, { "epoch": 2.934850051706308, "grad_norm": 0.21420610997477973, "learning_rate": 1.111963190184049e-06, "loss": 0.3735, "step": 1420 }, { "epoch": 2.936918304033092, "grad_norm": 0.21869508136281401, "learning_rate": 1.0736196319018406e-06, "loss": 0.3855, "step": 1421 }, { "epoch": 2.938986556359876, "grad_norm": 0.21334109554533312, "learning_rate": 1.035276073619632e-06, "loss": 0.4237, "step": 1422 }, { "epoch": 2.9410548086866597, "grad_norm": 0.23744692747267726, "learning_rate": 9.969325153374232e-07, "loss": 0.437, "step": 1423 }, { "epoch": 2.9431230610134436, "grad_norm": 0.22033677076405395, "learning_rate": 9.585889570552148e-07, "loss": 0.4581, "step": 1424 }, { "epoch": 2.9451913133402274, "grad_norm": 0.2195543155656596, "learning_rate": 9.202453987730062e-07, "loss": 0.4033, "step": 1425 }, { "epoch": 2.9472595656670113, "grad_norm": 0.2370942708259419, "learning_rate": 8.819018404907976e-07, "loss": 0.4657, "step": 1426 }, { "epoch": 2.949327817993795, "grad_norm": 0.23445890566583089, "learning_rate": 8.43558282208589e-07, "loss": 0.4266, "step": 1427 }, { "epoch": 2.951396070320579, "grad_norm": 0.22000734388909823, "learning_rate": 8.052147239263804e-07, "loss": 0.434, "step": 1428 }, { "epoch": 2.953464322647363, "grad_norm": 0.22211045596613316, "learning_rate": 7.668711656441718e-07, "loss": 0.4332, "step": 1429 }, { "epoch": 2.955532574974147, "grad_norm": 0.20754599233601612, "learning_rate": 7.285276073619632e-07, "loss": 0.3672, "step": 1430 }, { "epoch": 2.9576008273009307, "grad_norm": 0.2262766383644341, "learning_rate": 6.901840490797546e-07, "loss": 0.4378, "step": 1431 }, { "epoch": 2.9596690796277145, "grad_norm": 0.2152273988690833, "learning_rate": 6.51840490797546e-07, "loss": 0.3686, "step": 1432 }, { "epoch": 2.9617373319544984, "grad_norm": 0.21391839066045834, "learning_rate": 6.134969325153375e-07, "loss": 0.3816, "step": 1433 }, { "epoch": 2.9638055842812823, "grad_norm": 0.21768246522081797, "learning_rate": 5.751533742331288e-07, "loss": 0.3657, "step": 1434 }, { "epoch": 2.965873836608066, "grad_norm": 0.21732342476182454, "learning_rate": 5.368098159509203e-07, "loss": 0.3871, "step": 1435 }, { "epoch": 2.96794208893485, "grad_norm": 0.251886240537341, "learning_rate": 4.984662576687116e-07, "loss": 0.4063, "step": 1436 }, { "epoch": 2.970010341261634, "grad_norm": 0.22138070049770045, "learning_rate": 4.601226993865031e-07, "loss": 0.4092, "step": 1437 }, { "epoch": 2.9720785935884177, "grad_norm": 0.2269882997467649, "learning_rate": 4.217791411042945e-07, "loss": 0.4061, "step": 1438 }, { "epoch": 2.9741468459152016, "grad_norm": 0.238387212851704, "learning_rate": 3.834355828220859e-07, "loss": 0.4232, "step": 1439 }, { "epoch": 2.9762150982419855, "grad_norm": 0.23638134351222842, "learning_rate": 3.450920245398773e-07, "loss": 0.4223, "step": 1440 }, { "epoch": 2.9782833505687694, "grad_norm": 0.21215599709862457, "learning_rate": 3.0674846625766876e-07, "loss": 0.3934, "step": 1441 }, { "epoch": 2.9803516028955532, "grad_norm": 0.20974198457812068, "learning_rate": 2.6840490797546014e-07, "loss": 0.3704, "step": 1442 }, { "epoch": 2.982419855222337, "grad_norm": 0.21225301198402077, "learning_rate": 2.3006134969325155e-07, "loss": 0.3847, "step": 1443 }, { "epoch": 2.984488107549121, "grad_norm": 0.23183285816406723, "learning_rate": 1.9171779141104294e-07, "loss": 0.4339, "step": 1444 }, { "epoch": 2.986556359875905, "grad_norm": 0.2094212891628721, "learning_rate": 1.5337423312883438e-07, "loss": 0.3801, "step": 1445 }, { "epoch": 2.9886246122026887, "grad_norm": 0.23104007749070318, "learning_rate": 1.1503067484662578e-07, "loss": 0.4296, "step": 1446 }, { "epoch": 2.9906928645294726, "grad_norm": 0.23103066860301902, "learning_rate": 7.668711656441719e-08, "loss": 0.4379, "step": 1447 }, { "epoch": 2.9927611168562565, "grad_norm": 0.21313341693687393, "learning_rate": 3.8343558282208595e-08, "loss": 0.3964, "step": 1448 }, { "epoch": 2.9948293691830403, "grad_norm": 0.20602917187196162, "learning_rate": 0.0, "loss": 0.3886, "step": 1449 }, { "epoch": 2.9948293691830403, "step": 1449, "total_flos": 1.2235415373217792e+18, "train_loss": 0.6515768039136035, "train_runtime": 84025.0926, "train_samples_per_second": 0.276, "train_steps_per_second": 0.017 } ], "logging_steps": 1, "max_steps": 1449, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2235415373217792e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }