zhoujun committed
Commit 5055411 · verified · 1 Parent(s): 44a9172

Upload trainer_state.json

Files changed (1)
  1. trainer_state.json +1505 -0
trainer_state.json ADDED
@@ -0,0 +1,1505 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.0,
+ "eval_steps": 500,
+ "global_step": 184,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.010869565217391304,
+ "grad_norm": 4.7562408971323125,
+ "learning_rate": 1.0526315789473685e-06,
+ "loss": 1.4045,
+ "num_tokens": 90966.0,
+ "step": 1
+ },
+ {
+ "epoch": 0.021739130434782608,
+ "grad_norm": 4.856641936347526,
+ "learning_rate": 2.105263157894737e-06,
+ "loss": 1.4286,
+ "num_tokens": 176808.0,
+ "step": 2
+ },
+ {
+ "epoch": 0.03260869565217391,
+ "grad_norm": 4.632629643475528,
+ "learning_rate": 3.157894736842105e-06,
+ "loss": 1.3767,
+ "num_tokens": 268011.0,
+ "step": 3
+ },
+ {
+ "epoch": 0.043478260869565216,
+ "grad_norm": 4.433390852741152,
+ "learning_rate": 4.210526315789474e-06,
+ "loss": 1.3293,
+ "num_tokens": 369286.0,
+ "step": 4
+ },
+ {
+ "epoch": 0.05434782608695652,
+ "grad_norm": 4.267197673905712,
+ "learning_rate": 5.263157894736842e-06,
+ "loss": 1.3839,
+ "num_tokens": 465540.0,
+ "step": 5
+ },
+ {
+ "epoch": 0.06521739130434782,
+ "grad_norm": 3.717050440728708,
+ "learning_rate": 6.31578947368421e-06,
+ "loss": 1.2858,
+ "num_tokens": 567659.0,
+ "step": 6
+ },
+ {
+ "epoch": 0.07608695652173914,
+ "grad_norm": 3.180745867673301,
+ "learning_rate": 7.368421052631579e-06,
+ "loss": 1.188,
+ "num_tokens": 665757.0,
+ "step": 7
+ },
+ {
+ "epoch": 0.08695652173913043,
+ "grad_norm": 2.756203802259368,
+ "learning_rate": 8.421052631578948e-06,
+ "loss": 1.2036,
+ "num_tokens": 764351.0,
+ "step": 8
+ },
+ {
+ "epoch": 0.09782608695652174,
+ "grad_norm": 2.722957346536631,
+ "learning_rate": 9.473684210526315e-06,
+ "loss": 1.1352,
+ "num_tokens": 859181.0,
+ "step": 9
+ },
+ {
+ "epoch": 0.10869565217391304,
+ "grad_norm": 3.149236707148637,
+ "learning_rate": 1.0526315789473684e-05,
+ "loss": 1.0399,
+ "num_tokens": 958891.0,
+ "step": 10
+ },
+ {
+ "epoch": 0.11956521739130435,
+ "grad_norm": 2.5830391272833104,
+ "learning_rate": 1.1578947368421053e-05,
+ "loss": 1.0534,
+ "num_tokens": 1057964.0,
+ "step": 11
+ },
+ {
+ "epoch": 0.13043478260869565,
+ "grad_norm": 2.1386887474211687,
+ "learning_rate": 1.263157894736842e-05,
+ "loss": 0.9946,
+ "num_tokens": 1150907.0,
+ "step": 12
+ },
+ {
+ "epoch": 0.14130434782608695,
+ "grad_norm": 2.211121746514346,
+ "learning_rate": 1.3684210526315791e-05,
+ "loss": 1.0225,
+ "num_tokens": 1238881.0,
+ "step": 13
+ },
+ {
+ "epoch": 0.15217391304347827,
+ "grad_norm": 2.0317494078733,
+ "learning_rate": 1.4736842105263159e-05,
+ "loss": 0.9779,
+ "num_tokens": 1336961.0,
+ "step": 14
+ },
+ {
+ "epoch": 0.16304347826086957,
+ "grad_norm": 1.6535140053442847,
+ "learning_rate": 1.578947368421053e-05,
+ "loss": 0.9645,
+ "num_tokens": 1427193.0,
+ "step": 15
+ },
+ {
+ "epoch": 0.17391304347826086,
+ "grad_norm": 3.7204342856811095,
+ "learning_rate": 1.6842105263157896e-05,
+ "loss": 0.9152,
+ "num_tokens": 1519751.0,
+ "step": 16
+ },
+ {
+ "epoch": 0.18478260869565216,
+ "grad_norm": 1.5902509836923946,
+ "learning_rate": 1.7894736842105264e-05,
+ "loss": 0.9191,
+ "num_tokens": 1616777.0,
+ "step": 17
+ },
+ {
+ "epoch": 0.1956521739130435,
+ "grad_norm": 1.4483857314150967,
+ "learning_rate": 1.894736842105263e-05,
+ "loss": 0.8791,
+ "num_tokens": 1714839.0,
+ "step": 18
+ },
+ {
+ "epoch": 0.20652173913043478,
+ "grad_norm": 1.3039225121757552,
+ "learning_rate": 2e-05,
+ "loss": 0.9332,
+ "num_tokens": 1815763.0,
+ "step": 19
+ },
+ {
+ "epoch": 0.21739130434782608,
+ "grad_norm": 1.3126973712776373,
+ "learning_rate": 1.9878787878787878e-05,
+ "loss": 0.8782,
+ "num_tokens": 1908693.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.22826086956521738,
+ "grad_norm": 1.4053049528153356,
+ "learning_rate": 1.975757575757576e-05,
+ "loss": 0.9232,
+ "num_tokens": 2004172.0,
+ "step": 21
+ },
+ {
+ "epoch": 0.2391304347826087,
+ "grad_norm": 1.1849138803900574,
+ "learning_rate": 1.963636363636364e-05,
+ "loss": 0.873,
+ "num_tokens": 2119371.0,
+ "step": 22
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 2.01085287355782,
+ "learning_rate": 1.9515151515151515e-05,
+ "loss": 0.9279,
+ "num_tokens": 2212596.0,
+ "step": 23
+ },
+ {
+ "epoch": 0.2608695652173913,
+ "grad_norm": 1.2458008758973527,
+ "learning_rate": 1.9393939393939395e-05,
+ "loss": 0.8654,
+ "num_tokens": 2294656.0,
+ "step": 24
+ },
+ {
+ "epoch": 0.2717391304347826,
+ "grad_norm": 1.338715786372464,
+ "learning_rate": 1.9272727272727275e-05,
+ "loss": 0.8992,
+ "num_tokens": 2391442.0,
+ "step": 25
+ },
+ {
+ "epoch": 0.2826086956521739,
+ "grad_norm": 1.0044343113961611,
+ "learning_rate": 1.9151515151515152e-05,
+ "loss": 0.8313,
+ "num_tokens": 2494467.0,
+ "step": 26
+ },
+ {
+ "epoch": 0.29347826086956524,
+ "grad_norm": 1.1561251486922257,
+ "learning_rate": 1.9030303030303032e-05,
+ "loss": 0.8516,
+ "num_tokens": 2584063.0,
+ "step": 27
+ },
+ {
+ "epoch": 0.30434782608695654,
+ "grad_norm": 1.132844292406038,
+ "learning_rate": 1.8909090909090912e-05,
+ "loss": 0.7765,
+ "num_tokens": 2670487.0,
+ "step": 28
+ },
+ {
+ "epoch": 0.31521739130434784,
+ "grad_norm": 1.0830282801682392,
+ "learning_rate": 1.8787878787878792e-05,
+ "loss": 0.855,
+ "num_tokens": 2762813.0,
+ "step": 29
+ },
+ {
+ "epoch": 0.32608695652173914,
+ "grad_norm": 1.0350686000005194,
+ "learning_rate": 1.866666666666667e-05,
+ "loss": 0.8527,
+ "num_tokens": 2850331.0,
+ "step": 30
+ },
+ {
+ "epoch": 0.33695652173913043,
+ "grad_norm": 1.0369499924518195,
+ "learning_rate": 1.8545454545454545e-05,
+ "loss": 0.7974,
+ "num_tokens": 2942596.0,
+ "step": 31
+ },
+ {
+ "epoch": 0.34782608695652173,
+ "grad_norm": 1.1454834580125053,
+ "learning_rate": 1.8424242424242425e-05,
+ "loss": 0.8509,
+ "num_tokens": 3039852.0,
+ "step": 32
+ },
+ {
+ "epoch": 0.358695652173913,
+ "grad_norm": 1.0966923132147153,
+ "learning_rate": 1.8303030303030305e-05,
+ "loss": 0.8426,
+ "num_tokens": 3136550.0,
+ "step": 33
+ },
+ {
+ "epoch": 0.3695652173913043,
+ "grad_norm": 0.9615713216664558,
+ "learning_rate": 1.8181818181818182e-05,
+ "loss": 0.8209,
+ "num_tokens": 3226573.0,
+ "step": 34
+ },
+ {
+ "epoch": 0.3804347826086957,
+ "grad_norm": 1.0463868635910665,
+ "learning_rate": 1.8060606060606062e-05,
+ "loss": 0.8322,
+ "num_tokens": 3319993.0,
+ "step": 35
+ },
+ {
+ "epoch": 0.391304347826087,
+ "grad_norm": 0.9016694267436932,
+ "learning_rate": 1.7939393939393942e-05,
+ "loss": 0.8494,
+ "num_tokens": 3409783.0,
+ "step": 36
+ },
+ {
+ "epoch": 0.40217391304347827,
+ "grad_norm": 0.8120321544484976,
+ "learning_rate": 1.781818181818182e-05,
+ "loss": 0.8246,
+ "num_tokens": 3508366.0,
+ "step": 37
+ },
+ {
+ "epoch": 0.41304347826086957,
+ "grad_norm": 0.7992720431772786,
+ "learning_rate": 1.76969696969697e-05,
+ "loss": 0.7703,
+ "num_tokens": 3602876.0,
+ "step": 38
+ },
+ {
+ "epoch": 0.42391304347826086,
+ "grad_norm": 0.7908788366960535,
+ "learning_rate": 1.7575757575757576e-05,
+ "loss": 0.8249,
+ "num_tokens": 3697241.0,
+ "step": 39
+ },
+ {
+ "epoch": 0.43478260869565216,
+ "grad_norm": 0.8154728597756308,
+ "learning_rate": 1.7454545454545456e-05,
+ "loss": 0.825,
+ "num_tokens": 3783861.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.44565217391304346,
+ "grad_norm": 0.8244532763608124,
+ "learning_rate": 1.7333333333333336e-05,
+ "loss": 0.845,
+ "num_tokens": 3884088.0,
+ "step": 41
+ },
+ {
+ "epoch": 0.45652173913043476,
+ "grad_norm": 0.7694222272972445,
+ "learning_rate": 1.7212121212121212e-05,
+ "loss": 0.7804,
+ "num_tokens": 3972883.0,
+ "step": 42
+ },
+ {
+ "epoch": 0.4673913043478261,
+ "grad_norm": 0.7461120769896388,
+ "learning_rate": 1.7090909090909092e-05,
+ "loss": 0.7936,
+ "num_tokens": 4065958.0,
+ "step": 43
+ },
+ {
+ "epoch": 0.4782608695652174,
+ "grad_norm": 0.7808686279876842,
+ "learning_rate": 1.6969696969696972e-05,
+ "loss": 0.8452,
+ "num_tokens": 4164566.0,
+ "step": 44
+ },
+ {
+ "epoch": 0.4891304347826087,
+ "grad_norm": 0.7343804102094451,
+ "learning_rate": 1.684848484848485e-05,
+ "loss": 0.8031,
+ "num_tokens": 4254036.0,
+ "step": 45
+ },
+ {
+ "epoch": 0.5,
+ "grad_norm": 0.7439703254330587,
+ "learning_rate": 1.672727272727273e-05,
+ "loss": 0.8043,
+ "num_tokens": 4340424.0,
+ "step": 46
+ },
+ {
+ "epoch": 0.5108695652173914,
+ "grad_norm": 0.7396028883918819,
+ "learning_rate": 1.660606060606061e-05,
+ "loss": 0.7837,
+ "num_tokens": 4436870.0,
+ "step": 47
+ },
+ {
+ "epoch": 0.5217391304347826,
+ "grad_norm": 0.7608846409786219,
+ "learning_rate": 1.6484848484848486e-05,
+ "loss": 0.8515,
+ "num_tokens": 4530847.0,
+ "step": 48
+ },
+ {
+ "epoch": 0.532608695652174,
+ "grad_norm": 0.7181352448640346,
+ "learning_rate": 1.6363636363636366e-05,
+ "loss": 0.8198,
+ "num_tokens": 4624892.0,
+ "step": 49
+ },
+ {
+ "epoch": 0.5434782608695652,
+ "grad_norm": 0.7087375538457228,
+ "learning_rate": 1.6242424242424243e-05,
+ "loss": 0.8,
+ "num_tokens": 4726543.0,
+ "step": 50
+ },
+ {
+ "epoch": 0.5543478260869565,
+ "grad_norm": 0.7463024608938168,
+ "learning_rate": 1.6121212121212123e-05,
+ "loss": 0.7789,
+ "num_tokens": 4819956.0,
+ "step": 51
+ },
+ {
+ "epoch": 0.5652173913043478,
+ "grad_norm": 0.7442588718390604,
+ "learning_rate": 1.6000000000000003e-05,
+ "loss": 0.8054,
+ "num_tokens": 4917035.0,
+ "step": 52
+ },
+ {
+ "epoch": 0.5760869565217391,
+ "grad_norm": 0.7183312987900197,
+ "learning_rate": 1.587878787878788e-05,
+ "loss": 0.7923,
+ "num_tokens": 5017078.0,
+ "step": 53
+ },
+ {
+ "epoch": 0.5869565217391305,
+ "grad_norm": 0.7212235895921912,
+ "learning_rate": 1.575757575757576e-05,
+ "loss": 0.7816,
+ "num_tokens": 5110982.0,
+ "step": 54
+ },
+ {
+ "epoch": 0.5978260869565217,
+ "grad_norm": 0.6984711344390122,
+ "learning_rate": 1.563636363636364e-05,
+ "loss": 0.802,
+ "num_tokens": 5200799.0,
+ "step": 55
+ },
+ {
+ "epoch": 0.6086956521739131,
+ "grad_norm": 0.7262030804489741,
+ "learning_rate": 1.5515151515151516e-05,
+ "loss": 0.7887,
+ "num_tokens": 5296443.0,
+ "step": 56
+ },
+ {
+ "epoch": 0.6195652173913043,
+ "grad_norm": 0.7665109508827059,
+ "learning_rate": 1.5393939393939393e-05,
+ "loss": 0.8081,
+ "num_tokens": 5392587.0,
+ "step": 57
+ },
+ {
+ "epoch": 0.6304347826086957,
+ "grad_norm": 0.7222105761333159,
+ "learning_rate": 1.5272727272727276e-05,
+ "loss": 0.7779,
+ "num_tokens": 5487722.0,
+ "step": 58
+ },
+ {
+ "epoch": 0.6413043478260869,
+ "grad_norm": 0.7297075918942776,
+ "learning_rate": 1.5151515151515153e-05,
+ "loss": 0.7518,
+ "num_tokens": 5585642.0,
+ "step": 59
+ },
+ {
+ "epoch": 0.6521739130434783,
+ "grad_norm": 0.7571906625686418,
+ "learning_rate": 1.5030303030303031e-05,
+ "loss": 0.7858,
+ "num_tokens": 5675685.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.6630434782608695,
+ "grad_norm": 0.7361173105602014,
+ "learning_rate": 1.4909090909090911e-05,
+ "loss": 0.8163,
+ "num_tokens": 5766441.0,
+ "step": 61
+ },
+ {
+ "epoch": 0.6739130434782609,
+ "grad_norm": 0.7105038659620458,
+ "learning_rate": 1.478787878787879e-05,
+ "loss": 0.7505,
+ "num_tokens": 5856978.0,
+ "step": 62
+ },
+ {
+ "epoch": 0.6847826086956522,
+ "grad_norm": 0.6825484337570947,
+ "learning_rate": 1.4666666666666666e-05,
+ "loss": 0.7751,
+ "num_tokens": 5941585.0,
+ "step": 63
+ },
+ {
+ "epoch": 0.6956521739130435,
+ "grad_norm": 0.7314955103489276,
+ "learning_rate": 1.4545454545454546e-05,
+ "loss": 0.8163,
+ "num_tokens": 6043058.0,
+ "step": 64
+ },
+ {
+ "epoch": 0.7065217391304348,
+ "grad_norm": 0.7103929889225186,
+ "learning_rate": 1.4424242424242425e-05,
+ "loss": 0.7832,
+ "num_tokens": 6147716.0,
+ "step": 65
+ },
+ {
+ "epoch": 0.717391304347826,
+ "grad_norm": 0.6793775565981599,
+ "learning_rate": 1.4303030303030305e-05,
+ "loss": 0.7264,
+ "num_tokens": 6238620.0,
+ "step": 66
+ },
+ {
+ "epoch": 0.7282608695652174,
+ "grad_norm": 0.7124999007971334,
+ "learning_rate": 1.4181818181818183e-05,
+ "loss": 0.7876,
+ "num_tokens": 6339589.0,
+ "step": 67
+ },
+ {
+ "epoch": 0.7391304347826086,
+ "grad_norm": 0.6758760592365682,
+ "learning_rate": 1.4060606060606061e-05,
+ "loss": 0.8137,
+ "num_tokens": 6452148.0,
+ "step": 68
+ },
+ {
+ "epoch": 0.75,
+ "grad_norm": 0.7083257729536743,
+ "learning_rate": 1.3939393939393942e-05,
+ "loss": 0.7611,
+ "num_tokens": 6546822.0,
+ "step": 69
+ },
+ {
+ "epoch": 0.7608695652173914,
+ "grad_norm": 0.6894843161471015,
+ "learning_rate": 1.381818181818182e-05,
+ "loss": 0.8409,
+ "num_tokens": 6652603.0,
+ "step": 70
+ },
+ {
+ "epoch": 0.7717391304347826,
+ "grad_norm": 0.7320922145311975,
+ "learning_rate": 1.3696969696969698e-05,
+ "loss": 0.7523,
+ "num_tokens": 6745015.0,
+ "step": 71
+ },
+ {
+ "epoch": 0.782608695652174,
+ "grad_norm": 0.7040695482776516,
+ "learning_rate": 1.3575757575757578e-05,
+ "loss": 0.7612,
+ "num_tokens": 6838204.0,
+ "step": 72
+ },
+ {
+ "epoch": 0.7934782608695652,
+ "grad_norm": 0.6847801952071528,
+ "learning_rate": 1.3454545454545455e-05,
+ "loss": 0.8138,
+ "num_tokens": 6933388.0,
+ "step": 73
+ },
+ {
+ "epoch": 0.8043478260869565,
+ "grad_norm": 0.7550074032673957,
+ "learning_rate": 1.3333333333333333e-05,
+ "loss": 0.7649,
+ "num_tokens": 7029939.0,
+ "step": 74
+ },
+ {
+ "epoch": 0.8152173913043478,
+ "grad_norm": 0.6925774304998896,
+ "learning_rate": 1.3212121212121213e-05,
+ "loss": 0.7576,
+ "num_tokens": 7128091.0,
+ "step": 75
+ },
+ {
+ "epoch": 0.8260869565217391,
+ "grad_norm": 0.7562225545168595,
+ "learning_rate": 1.3090909090909092e-05,
+ "loss": 0.7916,
+ "num_tokens": 7214845.0,
+ "step": 76
+ },
+ {
+ "epoch": 0.8369565217391305,
+ "grad_norm": 0.6510325534353791,
+ "learning_rate": 1.296969696969697e-05,
+ "loss": 0.7825,
+ "num_tokens": 7323115.0,
+ "step": 77
+ },
+ {
+ "epoch": 0.8478260869565217,
+ "grad_norm": 0.7026385885074842,
+ "learning_rate": 1.284848484848485e-05,
+ "loss": 0.7866,
+ "num_tokens": 7420683.0,
+ "step": 78
+ },
+ {
+ "epoch": 0.8586956521739131,
+ "grad_norm": 0.7213706799641426,
+ "learning_rate": 1.2727272727272728e-05,
+ "loss": 0.7789,
+ "num_tokens": 7504805.0,
+ "step": 79
+ },
+ {
+ "epoch": 0.8695652173913043,
+ "grad_norm": 0.7056975956417906,
+ "learning_rate": 1.2606060606060607e-05,
+ "loss": 0.779,
+ "num_tokens": 7594818.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.8804347826086957,
+ "grad_norm": 0.6742743459752469,
+ "learning_rate": 1.2484848484848487e-05,
+ "loss": 0.7664,
+ "num_tokens": 7687306.0,
+ "step": 81
+ },
+ {
+ "epoch": 0.8913043478260869,
+ "grad_norm": 0.6788480168543541,
+ "learning_rate": 1.2363636363636364e-05,
+ "loss": 0.7481,
+ "num_tokens": 7787926.0,
+ "step": 82
+ },
+ {
+ "epoch": 0.9021739130434783,
+ "grad_norm": 0.7159837226779882,
+ "learning_rate": 1.2242424242424242e-05,
+ "loss": 0.7658,
+ "num_tokens": 7886472.0,
+ "step": 83
+ },
+ {
+ "epoch": 0.9130434782608695,
+ "grad_norm": 0.6843998675172349,
+ "learning_rate": 1.2121212121212122e-05,
+ "loss": 0.8125,
+ "num_tokens": 7979894.0,
+ "step": 84
+ },
+ {
+ "epoch": 0.9239130434782609,
+ "grad_norm": 0.7299584101150774,
+ "learning_rate": 1.2e-05,
+ "loss": 0.7764,
+ "num_tokens": 8072266.0,
+ "step": 85
+ },
+ {
+ "epoch": 0.9347826086956522,
+ "grad_norm": 0.6972344562839721,
+ "learning_rate": 1.187878787878788e-05,
+ "loss": 0.7486,
+ "num_tokens": 8169019.0,
+ "step": 86
+ },
+ {
+ "epoch": 0.9456521739130435,
+ "grad_norm": 0.695628203135494,
+ "learning_rate": 1.1757575757575759e-05,
+ "loss": 0.7216,
+ "num_tokens": 8263517.0,
+ "step": 87
+ },
+ {
+ "epoch": 0.9565217391304348,
+ "grad_norm": 0.7199580320946095,
+ "learning_rate": 1.1636363636363637e-05,
+ "loss": 0.753,
+ "num_tokens": 8354397.0,
+ "step": 88
+ },
+ {
+ "epoch": 0.967391304347826,
+ "grad_norm": 0.688593706715687,
+ "learning_rate": 1.1515151515151517e-05,
+ "loss": 0.7736,
+ "num_tokens": 8457076.0,
+ "step": 89
+ },
+ {
+ "epoch": 0.9782608695652174,
+ "grad_norm": 0.7039533145498842,
+ "learning_rate": 1.1393939393939395e-05,
+ "loss": 0.747,
+ "num_tokens": 8543392.0,
+ "step": 90
+ },
+ {
+ "epoch": 0.9891304347826086,
+ "grad_norm": 0.7417219845924928,
+ "learning_rate": 1.1272727272727272e-05,
+ "loss": 0.8096,
+ "num_tokens": 8634673.0,
+ "step": 91
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 0.716787642824306,
+ "learning_rate": 1.1151515151515154e-05,
+ "loss": 0.7479,
+ "num_tokens": 8737000.0,
+ "step": 92
+ },
+ {
+ "epoch": 1.0108695652173914,
+ "grad_norm": 0.9557939792897692,
+ "learning_rate": 1.103030303030303e-05,
+ "loss": 0.5859,
+ "num_tokens": 8841242.0,
+ "step": 93
+ },
+ {
+ "epoch": 1.0217391304347827,
+ "grad_norm": 0.9640008403759153,
+ "learning_rate": 1.0909090909090909e-05,
+ "loss": 0.5499,
+ "num_tokens": 8939461.0,
+ "step": 94
+ },
+ {
+ "epoch": 1.0326086956521738,
+ "grad_norm": 0.8358505468758168,
+ "learning_rate": 1.0787878787878789e-05,
+ "loss": 0.5827,
+ "num_tokens": 9041498.0,
+ "step": 95
+ },
+ {
+ "epoch": 1.0434782608695652,
+ "grad_norm": 0.8333101189839462,
+ "learning_rate": 1.0666666666666667e-05,
+ "loss": 0.5649,
+ "num_tokens": 9135657.0,
+ "step": 96
+ },
+ {
+ "epoch": 1.0543478260869565,
+ "grad_norm": 0.8267054342321523,
+ "learning_rate": 1.0545454545454546e-05,
+ "loss": 0.4877,
+ "num_tokens": 9223861.0,
+ "step": 97
+ },
+ {
+ "epoch": 1.065217391304348,
+ "grad_norm": 0.9083108087266245,
+ "learning_rate": 1.0424242424242426e-05,
+ "loss": 0.5178,
+ "num_tokens": 9306683.0,
+ "step": 98
+ },
+ {
+ "epoch": 1.0760869565217392,
+ "grad_norm": 1.0509887548129675,
+ "learning_rate": 1.0303030303030304e-05,
+ "loss": 0.5147,
+ "num_tokens": 9403981.0,
+ "step": 99
+ },
+ {
+ "epoch": 1.0869565217391304,
+ "grad_norm": 0.9617465165511758,
+ "learning_rate": 1.0181818181818182e-05,
+ "loss": 0.5772,
+ "num_tokens": 9510065.0,
+ "step": 100
+ },
+ {
+ "epoch": 1.0978260869565217,
+ "grad_norm": 0.8289148974326247,
+ "learning_rate": 1.0060606060606062e-05,
+ "loss": 0.4808,
+ "num_tokens": 9594692.0,
+ "step": 101
+ },
+ {
+ "epoch": 1.108695652173913,
+ "grad_norm": 0.7858587143009836,
+ "learning_rate": 9.939393939393939e-06,
+ "loss": 0.4966,
+ "num_tokens": 9686151.0,
+ "step": 102
+ },
+ {
+ "epoch": 1.1195652173913044,
+ "grad_norm": 0.7571756541563724,
+ "learning_rate": 9.81818181818182e-06,
+ "loss": 0.5566,
+ "num_tokens": 9785432.0,
+ "step": 103
+ },
+ {
+ "epoch": 1.1304347826086956,
+ "grad_norm": 0.7871226692453429,
+ "learning_rate": 9.696969696969698e-06,
+ "loss": 0.5247,
+ "num_tokens": 9872855.0,
+ "step": 104
+ },
+ {
+ "epoch": 1.141304347826087,
+ "grad_norm": 0.7536149334148949,
+ "learning_rate": 9.575757575757576e-06,
+ "loss": 0.5696,
+ "num_tokens": 9966227.0,
+ "step": 105
+ },
+ {
+ "epoch": 1.1521739130434783,
+ "grad_norm": 0.7674793225688019,
+ "learning_rate": 9.454545454545456e-06,
+ "loss": 0.5085,
+ "num_tokens": 10066104.0,
+ "step": 106
+ },
+ {
+ "epoch": 1.1630434782608696,
+ "grad_norm": 0.6536542113768675,
+ "learning_rate": 9.333333333333334e-06,
+ "loss": 0.4929,
+ "num_tokens": 10163479.0,
+ "step": 107
+ },
+ {
+ "epoch": 1.1739130434782608,
+ "grad_norm": 0.705223538854848,
+ "learning_rate": 9.212121212121213e-06,
+ "loss": 0.5041,
+ "num_tokens": 10256103.0,
+ "step": 108
+ },
+ {
+ "epoch": 1.184782608695652,
+ "grad_norm": 0.7303409820035304,
+ "learning_rate": 9.090909090909091e-06,
+ "loss": 0.5068,
+ "num_tokens": 10346350.0,
+ "step": 109
+ },
+ {
+ "epoch": 1.1956521739130435,
+ "grad_norm": 0.669830511929235,
+ "learning_rate": 8.969696969696971e-06,
+ "loss": 0.5058,
+ "num_tokens": 10446545.0,
+ "step": 110
+ },
+ {
+ "epoch": 1.2065217391304348,
+ "grad_norm": 0.6616180902943235,
+ "learning_rate": 8.84848484848485e-06,
+ "loss": 0.5212,
+ "num_tokens": 10543987.0,
+ "step": 111
+ },
+ {
+ "epoch": 1.2173913043478262,
+ "grad_norm": 0.7140780588825608,
+ "learning_rate": 8.727272727272728e-06,
+ "loss": 0.5542,
+ "num_tokens": 10637982.0,
+ "step": 112
+ },
+ {
+ "epoch": 1.2282608695652173,
+ "grad_norm": 0.6797454711812516,
+ "learning_rate": 8.606060606060606e-06,
+ "loss": 0.5146,
+ "num_tokens": 10736295.0,
+ "step": 113
+ },
+ {
+ "epoch": 1.2391304347826086,
+ "grad_norm": 0.6970790226374188,
+ "learning_rate": 8.484848484848486e-06,
+ "loss": 0.4858,
+ "num_tokens": 10838089.0,
+ "step": 114
+ },
+ {
+ "epoch": 1.25,
+ "grad_norm": 0.7455739079700027,
+ "learning_rate": 8.363636363636365e-06,
+ "loss": 0.4863,
+ "num_tokens": 10924837.0,
+ "step": 115
+ },
+ {
+ "epoch": 1.2608695652173914,
+ "grad_norm": 0.6801436530929313,
+ "learning_rate": 8.242424242424243e-06,
+ "loss": 0.5271,
+ "num_tokens": 11021046.0,
+ "step": 116
+ },
+ {
+ "epoch": 1.2717391304347827,
+ "grad_norm": 0.6727217360986417,
+ "learning_rate": 8.121212121212121e-06,
+ "loss": 0.5162,
+ "num_tokens": 11117240.0,
+ "step": 117
+ },
+ {
+ "epoch": 1.2826086956521738,
+ "grad_norm": 0.7023642689024054,
+ "learning_rate": 8.000000000000001e-06,
+ "loss": 0.4953,
+ "num_tokens": 11210211.0,
+ "step": 118
+ },
+ {
+ "epoch": 1.2934782608695652,
+ "grad_norm": 0.6447357115502464,
+ "learning_rate": 7.87878787878788e-06,
+ "loss": 0.5144,
+ "num_tokens": 11303329.0,
+ "step": 119
+ },
+ {
+ "epoch": 1.3043478260869565,
+ "grad_norm": 0.6492607932024856,
+ "learning_rate": 7.757575757575758e-06,
+ "loss": 0.5034,
+ "num_tokens": 11394106.0,
+ "step": 120
+ },
+ {
+ "epoch": 1.315217391304348,
+ "grad_norm": 0.6515181308514346,
+ "learning_rate": 7.636363636363638e-06,
+ "loss": 0.535,
+ "num_tokens": 11490446.0,
+ "step": 121
+ },
+ {
+ "epoch": 1.3260869565217392,
+ "grad_norm": 0.672198656598516,
+ "learning_rate": 7.515151515151516e-06,
+ "loss": 0.5065,
+ "num_tokens": 11581978.0,
+ "step": 122
+ },
+ {
+ "epoch": 1.3369565217391304,
+ "grad_norm": 0.6741993377594862,
+ "learning_rate": 7.393939393939395e-06,
+ "loss": 0.4874,
+ "num_tokens": 11665181.0,
+ "step": 123
+ },
+ {
+ "epoch": 1.3478260869565217,
+ "grad_norm": 0.6125558254135477,
+ "learning_rate": 7.272727272727273e-06,
+ "loss": 0.5178,
+ "num_tokens": 11764989.0,
+ "step": 124
+ },
+ {
+ "epoch": 1.358695652173913,
+ "grad_norm": 0.6452975872864739,
+ "learning_rate": 7.151515151515152e-06,
+ "loss": 0.5048,
+ "num_tokens": 11860837.0,
+ "step": 125
+ },
+ {
+ "epoch": 1.3695652173913042,
+ "grad_norm": 0.6942100514879066,
+ "learning_rate": 7.030303030303031e-06,
+ "loss": 0.5337,
+ "num_tokens": 11949817.0,
+ "step": 126
+ },
+ {
+ "epoch": 1.3804347826086958,
+ "grad_norm": 0.6803177844769924,
+ "learning_rate": 6.90909090909091e-06,
+ "loss": 0.5114,
+ "num_tokens": 12046147.0,
+ "step": 127
+ },
+ {
+ "epoch": 1.391304347826087,
+ "grad_norm": 0.6485232956728539,
+ "learning_rate": 6.787878787878789e-06,
+ "loss": 0.5136,
+ "num_tokens": 12155046.0,
+ "step": 128
+ },
+ {
+ "epoch": 1.4021739130434783,
+ "grad_norm": 0.6399901029660079,
+ "learning_rate": 6.666666666666667e-06,
+ "loss": 0.4899,
+ "num_tokens": 12245026.0,
+ "step": 129
+ },
+ {
+ "epoch": 1.4130434782608696,
+ "grad_norm": 0.6778428524940225,
+ "learning_rate": 6.545454545454546e-06,
+ "loss": 0.5007,
+ "num_tokens": 12339918.0,
+ "step": 130
+ },
+ {
+ "epoch": 1.4239130434782608,
+ "grad_norm": 0.6506595090929513,
+ "learning_rate": 6.424242424242425e-06,
+ "loss": 0.5123,
+ "num_tokens": 12431057.0,
+ "step": 131
+ },
+ {
+ "epoch": 1.434782608695652,
+ "grad_norm": 0.6042493738676704,
+ "learning_rate": 6.303030303030303e-06,
+ "loss": 0.5028,
+ "num_tokens": 12528957.0,
+ "step": 132
+ },
+ {
+ "epoch": 1.4456521739130435,
+ "grad_norm": 0.6655662091714003,
+ "learning_rate": 6.181818181818182e-06,
+ "loss": 0.5159,
+ "num_tokens": 12630846.0,
+ "step": 133
+ },
+ {
+ "epoch": 1.4565217391304348,
+ "grad_norm": 0.6581649207568613,
+ "learning_rate": 6.060606060606061e-06,
+ "loss": 0.5468,
+ "num_tokens": 12734120.0,
+ "step": 134
+ },
+ {
+ "epoch": 1.4673913043478262,
+ "grad_norm": 0.6792172891456207,
+ "learning_rate": 5.93939393939394e-06,
+ "loss": 0.515,
+ "num_tokens": 12825758.0,
+ "step": 135
+ },
+ {
+ "epoch": 1.4782608695652173,
+ "grad_norm": 0.6486207741620927,
+ "learning_rate": 5.8181818181818185e-06,
+ "loss": 0.4768,
+ "num_tokens": 12915706.0,
+ "step": 136
+ },
+ {
+ "epoch": 1.4891304347826086,
+ "grad_norm": 0.6229517910332318,
+ "learning_rate": 5.696969696969698e-06,
+ "loss": 0.5253,
+ "num_tokens": 13006745.0,
+ "step": 137
+ },
+ {
+ "epoch": 1.5,
+ "grad_norm": 0.6931899962995233,
+ "learning_rate": 5.575757575757577e-06,
+ "loss": 0.518,
+ "num_tokens": 13094837.0,
+ "step": 138
+ },
+ {
+ "epoch": 1.5108695652173914,
+ "grad_norm": 0.6790435623873506,
+ "learning_rate": 5.4545454545454545e-06,
+ "loss": 0.5225,
+ "num_tokens": 13186663.0,
+ "step": 139
+ },
+ {
+ "epoch": 1.5217391304347827,
+ "grad_norm": 0.6370361345960944,
+ "learning_rate": 5.333333333333334e-06,
+ "loss": 0.5281,
+ "num_tokens": 13275324.0,
+ "step": 140
+ },
+ {
+ "epoch": 1.5326086956521738,
+ "grad_norm": 0.6988457221148147,
+ "learning_rate": 5.212121212121213e-06,
+ "loss": 0.4471,
+ "num_tokens": 13354804.0,
+ "step": 141
+ },
+ {
+ "epoch": 1.5434782608695652,
+ "grad_norm": 0.6267926090215254,
+ "learning_rate": 5.090909090909091e-06,
+ "loss": 0.5081,
+ "num_tokens": 13459278.0,
+ "step": 142
+ },
+ {
+ "epoch": 1.5543478260869565,
+ "grad_norm": 0.6034410178549162,
+ "learning_rate": 4.9696969696969696e-06,
+ "loss": 0.4722,
+ "num_tokens": 13549338.0,
+ "step": 143
+ },
+ {
+ "epoch": 1.5652173913043477,
+ "grad_norm": 0.7074387736221774,
+ "learning_rate": 4.848484848484849e-06,
+ "loss": 0.5115,
+ "num_tokens": 13642023.0,
+ "step": 144
+ },
+ {
+ "epoch": 1.5760869565217392,
+ "grad_norm": 0.625505288129055,
+ "learning_rate": 4.727272727272728e-06,
+ "loss": 0.4854,
+ "num_tokens": 13734577.0,
+ "step": 145
+ },
+ {
+ "epoch": 1.5869565217391304,
+ "grad_norm": 0.6337610272860343,
+ "learning_rate": 4.606060606060606e-06,
+ "loss": 0.5218,
+ "num_tokens": 13827853.0,
+ "step": 146
+ },
+ {
+ "epoch": 1.5978260869565217,
+ "grad_norm": 0.5960020890354443,
+ "learning_rate": 4.4848484848484855e-06,
+ "loss": 0.4842,
+ "num_tokens": 13926421.0,
+ "step": 147
+ },
+ {
+ "epoch": 1.608695652173913,
+ "grad_norm": 0.6099226198445001,
+ "learning_rate": 4.363636363636364e-06,
+ "loss": 0.5045,
+ "num_tokens": 14027160.0,
+ "step": 148
+ },
+ {
+ "epoch": 1.6195652173913042,
+ "grad_norm": 0.6403047778481467,
+ "learning_rate": 4.242424242424243e-06,
+ "loss": 0.5112,
+ "num_tokens": 14123445.0,
+ "step": 149
+ },
+ {
+ "epoch": 1.6304347826086958,
+ "grad_norm": 0.6428678963084046,
+ "learning_rate": 4.1212121212121215e-06,
+ "loss": 0.4792,
+ "num_tokens": 14213841.0,
+ "step": 150
+ },
+ {
+ "epoch": 1.641304347826087,
+ "grad_norm": 0.682404747047711,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.4825,
+ "num_tokens": 14299391.0,
+ "step": 151
+ },
+ {
+ "epoch": 1.6521739130434783,
+ "grad_norm": 0.6392379137193949,
+ "learning_rate": 3.878787878787879e-06,
+ "loss": 0.5237,
+ "num_tokens": 14406803.0,
+ "step": 152
+ },
+ {
+ "epoch": 1.6630434782608696,
+ "grad_norm": 0.6336927405705557,
+ "learning_rate": 3.757575757575758e-06,
+ "loss": 0.4933,
+ "num_tokens": 14495074.0,
+ "step": 153
+ },
+ {
+ "epoch": 1.6739130434782608,
+ "grad_norm": 0.613723947203496,
+ "learning_rate": 3.6363636363636366e-06,
+ "loss": 0.4987,
+ "num_tokens": 14583035.0,
+ "step": 154
+ },
+ {
+ "epoch": 1.6847826086956523,
+ "grad_norm": 0.6454282865195516,
+ "learning_rate": 3.5151515151515154e-06,
+ "loss": 0.4844,
+ "num_tokens": 14675739.0,
+ "step": 155
+ },
+ {
+ "epoch": 1.6956521739130435,
+ "grad_norm": 0.5744835923909174,
+ "learning_rate": 3.3939393939393946e-06,
+ "loss": 0.5221,
+ "num_tokens": 14771295.0,
+ "step": 156
+ },
+ {
+ "epoch": 1.7065217391304348,
+ "grad_norm": 0.6513316019714479,
+ "learning_rate": 3.272727272727273e-06,
+ "loss": 0.5196,
+ "num_tokens": 14868828.0,
+ "step": 157
+ },
+ {
+ "epoch": 1.7173913043478262,
+ "grad_norm": 0.6051344138316549,
+ "learning_rate": 3.1515151515151517e-06,
+ "loss": 0.5399,
+ "num_tokens": 14962321.0,
+ "step": 158
+ },
+ {
+ "epoch": 1.7282608695652173,
+ "grad_norm": 0.5997374654265664,
+ "learning_rate": 3.0303030303030305e-06,
+ "loss": 0.566,
+ "num_tokens": 15071474.0,
+ "step": 159
+ },
+ {
+ "epoch": 1.7391304347826086,
+ "grad_norm": 0.6172378822692283,
+ "learning_rate": 2.9090909090909093e-06,
+ "loss": 0.5205,
+ "num_tokens": 15172326.0,
+ "step": 160
+ },
+ {
+ "epoch": 1.75,
+ "grad_norm": 0.6790771644844449,
+ "learning_rate": 2.7878787878787885e-06,
+ "loss": 0.5405,
+ "num_tokens": 15263118.0,
+ "step": 161
+ },
+ {
+ "epoch": 1.7608695652173914,
+ "grad_norm": 0.6250126730731829,
+ "learning_rate": 2.666666666666667e-06,
+ "loss": 0.5358,
+ "num_tokens": 15366482.0,
+ "step": 162
+ },
+ {
+ "epoch": 1.7717391304347827,
+ "grad_norm": 0.616414057408985,
+ "learning_rate": 2.5454545454545456e-06,
+ "loss": 0.5124,
+ "num_tokens": 15466519.0,
+ "step": 163
+ },
+ {
+ "epoch": 1.7826086956521738,
+ "grad_norm": 0.6196261922311431,
+ "learning_rate": 2.4242424242424244e-06,
+ "loss": 0.5011,
+ "num_tokens": 15568813.0,
+ "step": 164
+ },
+ {
+ "epoch": 1.7934782608695652,
+ "grad_norm": 0.6012492351790968,
+ "learning_rate": 2.303030303030303e-06,
+ "loss": 0.4866,
+ "num_tokens": 15663551.0,
+ "step": 165
+ },
+ {
+ "epoch": 1.8043478260869565,
+ "grad_norm": 0.6093660563249755,
+ "learning_rate": 2.181818181818182e-06,
+ "loss": 0.4806,
+ "num_tokens": 15751970.0,
+ "step": 166
+ },
+ {
+ "epoch": 1.8152173913043477,
+ "grad_norm": 0.5733134815640453,
+ "learning_rate": 2.0606060606060607e-06,
+ "loss": 0.5342,
+ "num_tokens": 15851852.0,
+ "step": 167
+ },
+ {
+ "epoch": 1.8260869565217392,
+ "grad_norm": 0.5629607438246448,
+ "learning_rate": 1.9393939393939395e-06,
+ "loss": 0.4899,
+ "num_tokens": 15953339.0,
+ "step": 168
+ },
+ {
+ "epoch": 1.8369565217391304,
+ "grad_norm": 0.5705735862059056,
+ "learning_rate": 1.8181818181818183e-06,
+ "loss": 0.5279,
+ "num_tokens": 16049218.0,
+ "step": 169
+ },
+ {
+ "epoch": 1.8478260869565217,
+ "grad_norm": 0.6346619776519064,
+ "learning_rate": 1.6969696969696973e-06,
+ "loss": 0.5129,
+ "num_tokens": 16148821.0,
+ "step": 170
+ },
+ {
+ "epoch": 1.858695652173913,
+ "grad_norm": 0.6442961172463525,
+ "learning_rate": 1.5757575757575759e-06,
+ "loss": 0.4936,
+ "num_tokens": 16236603.0,
+ "step": 171
+ },
+ {
+ "epoch": 1.8695652173913042,
+ "grad_norm": 0.6198075225575228,
+ "learning_rate": 1.4545454545454546e-06,
+ "loss": 0.5051,
+ "num_tokens": 16331317.0,
+ "step": 172
+ },
+ {
+ "epoch": 1.8804347826086958,
+ "grad_norm": 0.5952174959924715,
+ "learning_rate": 1.3333333333333334e-06,
+ "loss": 0.5081,
+ "num_tokens": 16435421.0,
+ "step": 173
+ },
+ {
+ "epoch": 1.891304347826087,
+ "grad_norm": 0.608740415655138,
+ "learning_rate": 1.2121212121212122e-06,
+ "loss": 0.4985,
+ "num_tokens": 16525942.0,
+ "step": 174
+ },
+ {
+ "epoch": 1.9021739130434783,
+ "grad_norm": 0.6186344271280536,
+ "learning_rate": 1.090909090909091e-06,
+ "loss": 0.4966,
+ "num_tokens": 16612253.0,
+ "step": 175
+ },
+ {
+ "epoch": 1.9130434782608696,
+ "grad_norm": 0.6128028556805487,
+ "learning_rate": 9.696969696969698e-07,
+ "loss": 0.4677,
+ "num_tokens": 16695763.0,
+ "step": 176
+ },
+ {
+ "epoch": 1.9239130434782608,
+ "grad_norm": 0.6186139521919142,
+ "learning_rate": 8.484848484848486e-07,
+ "loss": 0.5063,
+ "num_tokens": 16792394.0,
+ "step": 177
+ },
+ {
+ "epoch": 1.9347826086956523,
+ "grad_norm": 0.621118150443857,
+ "learning_rate": 7.272727272727273e-07,
+ "loss": 0.4859,
+ "num_tokens": 16887893.0,
+ "step": 178
+ },
+ {
+ "epoch": 1.9456521739130435,
+ "grad_norm": 0.63455296339652,
+ "learning_rate": 6.060606060606061e-07,
+ "loss": 0.5323,
+ "num_tokens": 16978736.0,
+ "step": 179
+ },
+ {
+ "epoch": 1.9565217391304348,
+ "grad_norm": 0.6338215890082622,
+ "learning_rate": 4.848484848484849e-07,
+ "loss": 0.5063,
+ "num_tokens": 17069163.0,
+ "step": 180
+ },
+ {
+ "epoch": 1.9673913043478262,
+ "grad_norm": 0.6122769745533587,
+ "learning_rate": 3.6363636363636366e-07,
+ "loss": 0.5523,
+ "num_tokens": 17170696.0,
+ "step": 181
+ },
+ {
+ "epoch": 1.9782608695652173,
+ "grad_norm": 0.5640059310251839,
+ "learning_rate": 2.4242424242424244e-07,
+ "loss": 0.5126,
+ "num_tokens": 17276399.0,
+ "step": 182
+ },
+ {
+ "epoch": 1.9891304347826086,
+ "grad_norm": 0.67719515967601,
+ "learning_rate": 1.2121212121212122e-07,
+ "loss": 0.549,
+ "num_tokens": 17377036.0,
+ "step": 183
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.5303896052463619,
+ "learning_rate": 0.0,
+ "loss": 0.476,
+ "num_tokens": 17486096.0,
+ "step": 184
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 184,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 2,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 198078707531776.0,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+ }
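
The uploaded file is a standard Hugging Face Trainer state dump, so the per-step metrics under "log_history" can be read back with a few lines of Python. A minimal sketch, assuming the file has been downloaded locally as trainer_state.json; the file name and the printed summary are illustrative, not part of the commit:

import json

# Load the trainer state uploaded in this commit (assumed saved locally).
with open("trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry carries epoch, step, loss, learning_rate, grad_norm, num_tokens.
logged = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in logged]
losses = [e["loss"] for e in logged]

print(f"global_step: {state['global_step']}, epochs: {state['epoch']}")
print(f"loss at step {steps[0]}: {losses[0]:.4f}")
print(f"loss at step {steps[-1]}: {losses[-1]:.4f}")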