ColorfulAI commited on
Commit
7b693ed
·
1 Parent(s): 65797a2
README.md CHANGED
@@ -1,3 +1,18 @@
1
  ---
2
  license: mit
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: mit
3
  ---
4
+
5
+ # M4-Audio-7B
6
+
7
+ Enhancing Interactive Capabilities in VideoLLM
8
+
9
+ M4-Audio-7B is an extension of [LongVA-7B](https://github.com/EvolvingLMMs-Lab/LongVA), further trained using the [M4-IT](https://huggingface.co/datasets/ColorfulAI/M4-IT) dataset, which comprises 9,963 visual-audio instruction tuning instances. This training was conducted without any special modifications to the existing training pipeline.
10
+
11
+
12
+ ## Usage
13
+
14
+ ![images](./assets/framework.png)
15
+
16
+
17
+ For more information about the interaction inference pipeline, please visit the [M4 GitHub repository](https://github.com/patrick-tssn/M4).
18
+
added_tokens.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "<|endoftext|>": 151643,
3
+ "<|im_end|>": 151645,
4
+ "<|im_start|>": 151644
5
+ }
config.json ADDED
@@ -0,0 +1,861 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "checkpoints/longva7b-qwen2-voiceassistant",
3
+ "architectures": [
4
+ "LlavaQwenForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151645,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 3584,
11
+ "image_aspect_ratio": "anyres",
12
+ "image_crop_resolution": null,
13
+ "image_grid_pinpoints": [
14
+ [
15
+ 336,
16
+ 672
17
+ ],
18
+ [
19
+ 336,
20
+ 1008
21
+ ],
22
+ [
23
+ 336,
24
+ 1344
25
+ ],
26
+ [
27
+ 336,
28
+ 1680
29
+ ],
30
+ [
31
+ 336,
32
+ 2016
33
+ ],
34
+ [
35
+ 336,
36
+ 2352
37
+ ],
38
+ [
39
+ 336,
40
+ 2688
41
+ ],
42
+ [
43
+ 336,
44
+ 3024
45
+ ],
46
+ [
47
+ 336,
48
+ 3360
49
+ ],
50
+ [
51
+ 336,
52
+ 3696
53
+ ],
54
+ [
55
+ 336,
56
+ 4032
57
+ ],
58
+ [
59
+ 336,
60
+ 4368
61
+ ],
62
+ [
63
+ 336,
64
+ 4704
65
+ ],
66
+ [
67
+ 336,
68
+ 5040
69
+ ],
70
+ [
71
+ 336,
72
+ 5376
73
+ ],
74
+ [
75
+ 336,
76
+ 5712
77
+ ],
78
+ [
79
+ 336,
80
+ 6048
81
+ ],
82
+ [
83
+ 336,
84
+ 6384
85
+ ],
86
+ [
87
+ 336,
88
+ 6720
89
+ ],
90
+ [
91
+ 336,
92
+ 7056
93
+ ],
94
+ [
95
+ 336,
96
+ 7392
97
+ ],
98
+ [
99
+ 336,
100
+ 7728
101
+ ],
102
+ [
103
+ 336,
104
+ 8064
105
+ ],
106
+ [
107
+ 336,
108
+ 8400
109
+ ],
110
+ [
111
+ 336,
112
+ 8736
113
+ ],
114
+ [
115
+ 336,
116
+ 9072
117
+ ],
118
+ [
119
+ 336,
120
+ 9408
121
+ ],
122
+ [
123
+ 336,
124
+ 9744
125
+ ],
126
+ [
127
+ 336,
128
+ 10080
129
+ ],
130
+ [
131
+ 336,
132
+ 10416
133
+ ],
134
+ [
135
+ 336,
136
+ 10752
137
+ ],
138
+ [
139
+ 336,
140
+ 11088
141
+ ],
142
+ [
143
+ 336,
144
+ 11424
145
+ ],
146
+ [
147
+ 336,
148
+ 11760
149
+ ],
150
+ [
151
+ 336,
152
+ 12096
153
+ ],
154
+ [
155
+ 336,
156
+ 12432
157
+ ],
158
+ [
159
+ 336,
160
+ 12768
161
+ ],
162
+ [
163
+ 336,
164
+ 13104
165
+ ],
166
+ [
167
+ 336,
168
+ 13440
169
+ ],
170
+ [
171
+ 336,
172
+ 13776
173
+ ],
174
+ [
175
+ 336,
176
+ 14112
177
+ ],
178
+ [
179
+ 336,
180
+ 14448
181
+ ],
182
+ [
183
+ 336,
184
+ 14784
185
+ ],
186
+ [
187
+ 336,
188
+ 15120
189
+ ],
190
+ [
191
+ 336,
192
+ 15456
193
+ ],
194
+ [
195
+ 336,
196
+ 15792
197
+ ],
198
+ [
199
+ 336,
200
+ 16128
201
+ ],
202
+ [
203
+ 336,
204
+ 16464
205
+ ],
206
+ [
207
+ 672,
208
+ 336
209
+ ],
210
+ [
211
+ 672,
212
+ 672
213
+ ],
214
+ [
215
+ 672,
216
+ 1008
217
+ ],
218
+ [
219
+ 672,
220
+ 1344
221
+ ],
222
+ [
223
+ 672,
224
+ 1680
225
+ ],
226
+ [
227
+ 672,
228
+ 2016
229
+ ],
230
+ [
231
+ 672,
232
+ 2352
233
+ ],
234
+ [
235
+ 672,
236
+ 2688
237
+ ],
238
+ [
239
+ 672,
240
+ 3024
241
+ ],
242
+ [
243
+ 672,
244
+ 3360
245
+ ],
246
+ [
247
+ 672,
248
+ 3696
249
+ ],
250
+ [
251
+ 672,
252
+ 4032
253
+ ],
254
+ [
255
+ 672,
256
+ 4368
257
+ ],
258
+ [
259
+ 672,
260
+ 4704
261
+ ],
262
+ [
263
+ 672,
264
+ 5040
265
+ ],
266
+ [
267
+ 672,
268
+ 5376
269
+ ],
270
+ [
271
+ 672,
272
+ 5712
273
+ ],
274
+ [
275
+ 672,
276
+ 6048
277
+ ],
278
+ [
279
+ 672,
280
+ 6384
281
+ ],
282
+ [
283
+ 672,
284
+ 6720
285
+ ],
286
+ [
287
+ 672,
288
+ 7056
289
+ ],
290
+ [
291
+ 672,
292
+ 7392
293
+ ],
294
+ [
295
+ 672,
296
+ 7728
297
+ ],
298
+ [
299
+ 672,
300
+ 8064
301
+ ],
302
+ [
303
+ 1008,
304
+ 336
305
+ ],
306
+ [
307
+ 1008,
308
+ 672
309
+ ],
310
+ [
311
+ 1008,
312
+ 1008
313
+ ],
314
+ [
315
+ 1008,
316
+ 1344
317
+ ],
318
+ [
319
+ 1008,
320
+ 1680
321
+ ],
322
+ [
323
+ 1008,
324
+ 2016
325
+ ],
326
+ [
327
+ 1008,
328
+ 2352
329
+ ],
330
+ [
331
+ 1008,
332
+ 2688
333
+ ],
334
+ [
335
+ 1008,
336
+ 3024
337
+ ],
338
+ [
339
+ 1008,
340
+ 3360
341
+ ],
342
+ [
343
+ 1008,
344
+ 3696
345
+ ],
346
+ [
347
+ 1008,
348
+ 4032
349
+ ],
350
+ [
351
+ 1008,
352
+ 4368
353
+ ],
354
+ [
355
+ 1008,
356
+ 4704
357
+ ],
358
+ [
359
+ 1008,
360
+ 5040
361
+ ],
362
+ [
363
+ 1008,
364
+ 5376
365
+ ],
366
+ [
367
+ 1344,
368
+ 336
369
+ ],
370
+ [
371
+ 1344,
372
+ 672
373
+ ],
374
+ [
375
+ 1344,
376
+ 1008
377
+ ],
378
+ [
379
+ 1344,
380
+ 1344
381
+ ],
382
+ [
383
+ 1344,
384
+ 1680
385
+ ],
386
+ [
387
+ 1344,
388
+ 2016
389
+ ],
390
+ [
391
+ 1344,
392
+ 2352
393
+ ],
394
+ [
395
+ 1344,
396
+ 2688
397
+ ],
398
+ [
399
+ 1344,
400
+ 3024
401
+ ],
402
+ [
403
+ 1344,
404
+ 3360
405
+ ],
406
+ [
407
+ 1344,
408
+ 3696
409
+ ],
410
+ [
411
+ 1344,
412
+ 4032
413
+ ],
414
+ [
415
+ 1680,
416
+ 336
417
+ ],
418
+ [
419
+ 1680,
420
+ 672
421
+ ],
422
+ [
423
+ 1680,
424
+ 1008
425
+ ],
426
+ [
427
+ 1680,
428
+ 1344
429
+ ],
430
+ [
431
+ 1680,
432
+ 1680
433
+ ],
434
+ [
435
+ 1680,
436
+ 2016
437
+ ],
438
+ [
439
+ 1680,
440
+ 2352
441
+ ],
442
+ [
443
+ 1680,
444
+ 2688
445
+ ],
446
+ [
447
+ 1680,
448
+ 3024
449
+ ],
450
+ [
451
+ 2016,
452
+ 336
453
+ ],
454
+ [
455
+ 2016,
456
+ 672
457
+ ],
458
+ [
459
+ 2016,
460
+ 1008
461
+ ],
462
+ [
463
+ 2016,
464
+ 1344
465
+ ],
466
+ [
467
+ 2016,
468
+ 1680
469
+ ],
470
+ [
471
+ 2016,
472
+ 2016
473
+ ],
474
+ [
475
+ 2016,
476
+ 2352
477
+ ],
478
+ [
479
+ 2016,
480
+ 2688
481
+ ],
482
+ [
483
+ 2352,
484
+ 336
485
+ ],
486
+ [
487
+ 2352,
488
+ 672
489
+ ],
490
+ [
491
+ 2352,
492
+ 1008
493
+ ],
494
+ [
495
+ 2352,
496
+ 1344
497
+ ],
498
+ [
499
+ 2352,
500
+ 1680
501
+ ],
502
+ [
503
+ 2352,
504
+ 2016
505
+ ],
506
+ [
507
+ 2352,
508
+ 2352
509
+ ],
510
+ [
511
+ 2688,
512
+ 336
513
+ ],
514
+ [
515
+ 2688,
516
+ 672
517
+ ],
518
+ [
519
+ 2688,
520
+ 1008
521
+ ],
522
+ [
523
+ 2688,
524
+ 1344
525
+ ],
526
+ [
527
+ 2688,
528
+ 1680
529
+ ],
530
+ [
531
+ 2688,
532
+ 2016
533
+ ],
534
+ [
535
+ 3024,
536
+ 336
537
+ ],
538
+ [
539
+ 3024,
540
+ 672
541
+ ],
542
+ [
543
+ 3024,
544
+ 1008
545
+ ],
546
+ [
547
+ 3024,
548
+ 1344
549
+ ],
550
+ [
551
+ 3024,
552
+ 1680
553
+ ],
554
+ [
555
+ 3360,
556
+ 336
557
+ ],
558
+ [
559
+ 3360,
560
+ 672
561
+ ],
562
+ [
563
+ 3360,
564
+ 1008
565
+ ],
566
+ [
567
+ 3360,
568
+ 1344
569
+ ],
570
+ [
571
+ 3696,
572
+ 336
573
+ ],
574
+ [
575
+ 3696,
576
+ 672
577
+ ],
578
+ [
579
+ 3696,
580
+ 1008
581
+ ],
582
+ [
583
+ 3696,
584
+ 1344
585
+ ],
586
+ [
587
+ 4032,
588
+ 336
589
+ ],
590
+ [
591
+ 4032,
592
+ 672
593
+ ],
594
+ [
595
+ 4032,
596
+ 1008
597
+ ],
598
+ [
599
+ 4032,
600
+ 1344
601
+ ],
602
+ [
603
+ 4368,
604
+ 336
605
+ ],
606
+ [
607
+ 4368,
608
+ 672
609
+ ],
610
+ [
611
+ 4368,
612
+ 1008
613
+ ],
614
+ [
615
+ 4704,
616
+ 336
617
+ ],
618
+ [
619
+ 4704,
620
+ 672
621
+ ],
622
+ [
623
+ 4704,
624
+ 1008
625
+ ],
626
+ [
627
+ 5040,
628
+ 336
629
+ ],
630
+ [
631
+ 5040,
632
+ 672
633
+ ],
634
+ [
635
+ 5040,
636
+ 1008
637
+ ],
638
+ [
639
+ 5376,
640
+ 336
641
+ ],
642
+ [
643
+ 5376,
644
+ 672
645
+ ],
646
+ [
647
+ 5376,
648
+ 1008
649
+ ],
650
+ [
651
+ 5712,
652
+ 336
653
+ ],
654
+ [
655
+ 5712,
656
+ 672
657
+ ],
658
+ [
659
+ 6048,
660
+ 336
661
+ ],
662
+ [
663
+ 6048,
664
+ 672
665
+ ],
666
+ [
667
+ 6384,
668
+ 336
669
+ ],
670
+ [
671
+ 6384,
672
+ 672
673
+ ],
674
+ [
675
+ 6720,
676
+ 336
677
+ ],
678
+ [
679
+ 6720,
680
+ 672
681
+ ],
682
+ [
683
+ 7056,
684
+ 336
685
+ ],
686
+ [
687
+ 7056,
688
+ 672
689
+ ],
690
+ [
691
+ 7392,
692
+ 336
693
+ ],
694
+ [
695
+ 7392,
696
+ 672
697
+ ],
698
+ [
699
+ 7728,
700
+ 336
701
+ ],
702
+ [
703
+ 7728,
704
+ 672
705
+ ],
706
+ [
707
+ 8064,
708
+ 336
709
+ ],
710
+ [
711
+ 8064,
712
+ 672
713
+ ],
714
+ [
715
+ 8400,
716
+ 336
717
+ ],
718
+ [
719
+ 8736,
720
+ 336
721
+ ],
722
+ [
723
+ 9072,
724
+ 336
725
+ ],
726
+ [
727
+ 9408,
728
+ 336
729
+ ],
730
+ [
731
+ 9744,
732
+ 336
733
+ ],
734
+ [
735
+ 10080,
736
+ 336
737
+ ],
738
+ [
739
+ 10416,
740
+ 336
741
+ ],
742
+ [
743
+ 10752,
744
+ 336
745
+ ],
746
+ [
747
+ 11088,
748
+ 336
749
+ ],
750
+ [
751
+ 11424,
752
+ 336
753
+ ],
754
+ [
755
+ 11760,
756
+ 336
757
+ ],
758
+ [
759
+ 12096,
760
+ 336
761
+ ],
762
+ [
763
+ 12432,
764
+ 336
765
+ ],
766
+ [
767
+ 12768,
768
+ 336
769
+ ],
770
+ [
771
+ 13104,
772
+ 336
773
+ ],
774
+ [
775
+ 13440,
776
+ 336
777
+ ],
778
+ [
779
+ 13776,
780
+ 336
781
+ ],
782
+ [
783
+ 14112,
784
+ 336
785
+ ],
786
+ [
787
+ 14448,
788
+ 336
789
+ ],
790
+ [
791
+ 14784,
792
+ 336
793
+ ],
794
+ [
795
+ 15120,
796
+ 336
797
+ ],
798
+ [
799
+ 15456,
800
+ 336
801
+ ],
802
+ [
803
+ 15792,
804
+ 336
805
+ ],
806
+ [
807
+ 16128,
808
+ 336
809
+ ],
810
+ [
811
+ 16464,
812
+ 336
813
+ ]
814
+ ],
815
+ "image_split_resolution": null,
816
+ "initializer_range": 0.02,
817
+ "intermediate_size": 18944,
818
+ "max_position_embeddings": 224000,
819
+ "max_window_layers": 28,
820
+ "mm_hidden_size": 1024,
821
+ "mm_patch_merge_type": "unires",
822
+ "mm_projector_lr": null,
823
+ "mm_projector_type": "mlp2x_gelu",
824
+ "mm_resampler_type": null,
825
+ "mm_spatial_pool_mode": "average",
826
+ "mm_spatial_pool_stride": 2,
827
+ "mm_tunable_parts": "speech_projector,mm_mlp_adapter,mm_language_model",
828
+ "mm_use_im_patch_token": false,
829
+ "mm_use_im_start_end": false,
830
+ "mm_vision_select_feature": "patch",
831
+ "mm_vision_select_layer": -2,
832
+ "mm_vision_tower": "checkpoints/clip-vit-large-patch14-336",
833
+ "mm_vision_tower_lr": 2e-06,
834
+ "model_type": "qwen2",
835
+ "num_attention_heads": 28,
836
+ "num_hidden_layers": 28,
837
+ "num_key_value_heads": 4,
838
+ "pos_skipping_range": 4096,
839
+ "rms_norm_eps": 1e-06,
840
+ "rope_scaling": null,
841
+ "rope_theta": 1000000000.0,
842
+ "sliding_window": null,
843
+ "speech_encoder": "checkpoints/whisper/large-v3.pt",
844
+ "speech_encoder_ds_rate": 5,
845
+ "speech_encoder_hidden_size": 1280,
846
+ "speech_encoder_type": "whisper",
847
+ "speech_normalize": false,
848
+ "speech_projector_lr": null,
849
+ "speech_projector_type": "linear",
850
+ "tie_word_embeddings": false,
851
+ "tokenizer_model_max_length": 16384,
852
+ "tokenizer_padding_side": "right",
853
+ "torch_dtype": "bfloat16",
854
+ "transformers_version": "4.44.0",
855
+ "use_cache": true,
856
+ "use_mm_proj": true,
857
+ "use_pos_skipping": false,
858
+ "use_sliding_window": false,
859
+ "vision_tower_pretrained": null,
860
+ "vocab_size": 152064
861
+ }
generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation": "flash_attention_2",
3
+ "bos_token_id": 151643,
4
+ "do_sample": true,
5
+ "eos_token_id": [
6
+ 151645,
7
+ 151643
8
+ ],
9
+ "pad_token_id": 151643,
10
+ "repetition_penalty": 1.05,
11
+ "rope_theta": 1000000000.0,
12
+ "temperature": 0.7,
13
+ "top_k": 20,
14
+ "top_p": 0.8,
15
+ "transformers_version": "4.44.0"
16
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eba4d40cf97b4f248a4792134487227c9d7039a238e16eab992a594e9c27d64e
3
+ size 4877660776
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7f8beaca936c534a2389ac8189ed0d43cc25229f669bfc0eeeec7915f587fa0
3
+ size 4932751008
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03ea0b848fe54b19ec312e9782030f627c5a6d5a09d68aad294e555903a5f32c
3
+ size 4998766592
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:768f0129d73f42fdb45c423258b77a0ed5d1b4ae296d689ae4300f19f914f6a2
3
+ size 2377117192
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "eos_token": {
7
+ "content": "<|im_end|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "pad_token": {
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ }
20
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": [
30
+ "<|im_start|>",
31
+ "<|im_end|>"
32
+ ],
33
+ "bos_token": null,
34
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
35
+ "clean_up_tokenization_spaces": false,
36
+ "eos_token": "<|im_end|>",
37
+ "errors": "replace",
38
+ "model_max_length": 16384,
39
+ "pad_token": "<|endoftext|>",
40
+ "padding_side": "right",
41
+ "split_special_tokens": false,
42
+ "tokenizer_class": "Qwen2Tokenizer",
43
+ "unk_token": null
44
+ }
trainer_state.json ADDED
@@ -0,0 +1,2219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9991967871485944,
5
+ "eval_steps": 500,
6
+ "global_step": 311,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.00321285140562249,
13
+ "grad_norm": 9.468059539794922,
14
+ "learning_rate": 1.0000000000000002e-06,
15
+ "loss": 1.1591,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.00642570281124498,
20
+ "grad_norm": 12.971900939941406,
21
+ "learning_rate": 2.0000000000000003e-06,
22
+ "loss": 0.9396,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.00963855421686747,
27
+ "grad_norm": 9.142277717590332,
28
+ "learning_rate": 3e-06,
29
+ "loss": 1.0339,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.01285140562248996,
34
+ "grad_norm": 7.4195380210876465,
35
+ "learning_rate": 4.000000000000001e-06,
36
+ "loss": 1.0046,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.01606425702811245,
41
+ "grad_norm": 7.382334232330322,
42
+ "learning_rate": 5e-06,
43
+ "loss": 0.8551,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.01927710843373494,
48
+ "grad_norm": 5.827909469604492,
49
+ "learning_rate": 6e-06,
50
+ "loss": 0.8208,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.02248995983935743,
55
+ "grad_norm": 4.3395795822143555,
56
+ "learning_rate": 7e-06,
57
+ "loss": 0.6632,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.02570281124497992,
62
+ "grad_norm": 34.95582962036133,
63
+ "learning_rate": 8.000000000000001e-06,
64
+ "loss": 1.1221,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.02891566265060241,
69
+ "grad_norm": 5.787684440612793,
70
+ "learning_rate": 9e-06,
71
+ "loss": 0.7092,
72
+ "step": 9
73
+ },
74
+ {
75
+ "epoch": 0.0321285140562249,
76
+ "grad_norm": 3.8266000747680664,
77
+ "learning_rate": 1e-05,
78
+ "loss": 0.8341,
79
+ "step": 10
80
+ },
81
+ {
82
+ "epoch": 0.035341365461847386,
83
+ "grad_norm": 7.160825252532959,
84
+ "learning_rate": 9.999727665400876e-06,
85
+ "loss": 0.7178,
86
+ "step": 11
87
+ },
88
+ {
89
+ "epoch": 0.03855421686746988,
90
+ "grad_norm": 3.6943607330322266,
91
+ "learning_rate": 9.998910691269957e-06,
92
+ "loss": 0.8316,
93
+ "step": 12
94
+ },
95
+ {
96
+ "epoch": 0.04176706827309237,
97
+ "grad_norm": 4.6069159507751465,
98
+ "learning_rate": 9.99754916660337e-06,
99
+ "loss": 0.631,
100
+ "step": 13
101
+ },
102
+ {
103
+ "epoch": 0.04497991967871486,
104
+ "grad_norm": 4.735833168029785,
105
+ "learning_rate": 9.995643239717228e-06,
106
+ "loss": 0.7483,
107
+ "step": 14
108
+ },
109
+ {
110
+ "epoch": 0.04819277108433735,
111
+ "grad_norm": 3.0799083709716797,
112
+ "learning_rate": 9.993193118231463e-06,
113
+ "loss": 0.4765,
114
+ "step": 15
115
+ },
116
+ {
117
+ "epoch": 0.05140562248995984,
118
+ "grad_norm": 2.8460216522216797,
119
+ "learning_rate": 9.990199069047216e-06,
120
+ "loss": 0.6183,
121
+ "step": 16
122
+ },
123
+ {
124
+ "epoch": 0.05461847389558233,
125
+ "grad_norm": 3.5829732418060303,
126
+ "learning_rate": 9.986661418317759e-06,
127
+ "loss": 0.7606,
128
+ "step": 17
129
+ },
130
+ {
131
+ "epoch": 0.05783132530120482,
132
+ "grad_norm": 2.786634683609009,
133
+ "learning_rate": 9.982580551412972e-06,
134
+ "loss": 0.6369,
135
+ "step": 18
136
+ },
137
+ {
138
+ "epoch": 0.06104417670682731,
139
+ "grad_norm": 3.3990700244903564,
140
+ "learning_rate": 9.977956912877356e-06,
141
+ "loss": 0.5388,
142
+ "step": 19
143
+ },
144
+ {
145
+ "epoch": 0.0642570281124498,
146
+ "grad_norm": 2.86079478263855,
147
+ "learning_rate": 9.97279100638161e-06,
148
+ "loss": 0.7238,
149
+ "step": 20
150
+ },
151
+ {
152
+ "epoch": 0.06746987951807229,
153
+ "grad_norm": 2.6607353687286377,
154
+ "learning_rate": 9.967083394667763e-06,
155
+ "loss": 0.6505,
156
+ "step": 21
157
+ },
158
+ {
159
+ "epoch": 0.07068273092369477,
160
+ "grad_norm": 2.682445526123047,
161
+ "learning_rate": 9.960834699487873e-06,
162
+ "loss": 0.4418,
163
+ "step": 22
164
+ },
165
+ {
166
+ "epoch": 0.07389558232931727,
167
+ "grad_norm": 2.598595380783081,
168
+ "learning_rate": 9.9540456015363e-06,
169
+ "loss": 0.616,
170
+ "step": 23
171
+ },
172
+ {
173
+ "epoch": 0.07710843373493977,
174
+ "grad_norm": 3.148313045501709,
175
+ "learning_rate": 9.946716840375552e-06,
176
+ "loss": 0.5655,
177
+ "step": 24
178
+ },
179
+ {
180
+ "epoch": 0.08032128514056225,
181
+ "grad_norm": 3.6518056392669678,
182
+ "learning_rate": 9.938849214355722e-06,
183
+ "loss": 0.5541,
184
+ "step": 25
185
+ },
186
+ {
187
+ "epoch": 0.08353413654618475,
188
+ "grad_norm": 2.7225069999694824,
189
+ "learning_rate": 9.93044358052752e-06,
190
+ "loss": 0.6182,
191
+ "step": 26
192
+ },
193
+ {
194
+ "epoch": 0.08674698795180723,
195
+ "grad_norm": 2.9630682468414307,
196
+ "learning_rate": 9.921500854548916e-06,
197
+ "loss": 0.5794,
198
+ "step": 27
199
+ },
200
+ {
201
+ "epoch": 0.08995983935742972,
202
+ "grad_norm": 2.659717082977295,
203
+ "learning_rate": 9.912022010585385e-06,
204
+ "loss": 0.5407,
205
+ "step": 28
206
+ },
207
+ {
208
+ "epoch": 0.09317269076305221,
209
+ "grad_norm": 2.64546799659729,
210
+ "learning_rate": 9.902008081203796e-06,
211
+ "loss": 0.5833,
212
+ "step": 29
213
+ },
214
+ {
215
+ "epoch": 0.0963855421686747,
216
+ "grad_norm": 3.679818868637085,
217
+ "learning_rate": 9.89146015725993e-06,
218
+ "loss": 0.6286,
219
+ "step": 30
220
+ },
221
+ {
222
+ "epoch": 0.09959839357429719,
223
+ "grad_norm": 3.6220390796661377,
224
+ "learning_rate": 9.880379387779637e-06,
225
+ "loss": 0.717,
226
+ "step": 31
227
+ },
228
+ {
229
+ "epoch": 0.10281124497991968,
230
+ "grad_norm": 2.7515664100646973,
231
+ "learning_rate": 9.868766979833686e-06,
232
+ "loss": 0.5431,
233
+ "step": 32
234
+ },
235
+ {
236
+ "epoch": 0.10602409638554217,
237
+ "grad_norm": 2.658803939819336,
238
+ "learning_rate": 9.856624198406262e-06,
239
+ "loss": 0.6401,
240
+ "step": 33
241
+ },
242
+ {
243
+ "epoch": 0.10923694779116466,
244
+ "grad_norm": 3.2038729190826416,
245
+ "learning_rate": 9.84395236625717e-06,
246
+ "loss": 0.5142,
247
+ "step": 34
248
+ },
249
+ {
250
+ "epoch": 0.11244979919678715,
251
+ "grad_norm": 4.682802200317383,
252
+ "learning_rate": 9.830752863777741e-06,
253
+ "loss": 0.6152,
254
+ "step": 35
255
+ },
256
+ {
257
+ "epoch": 0.11566265060240964,
258
+ "grad_norm": 2.9723594188690186,
259
+ "learning_rate": 9.817027128840462e-06,
260
+ "loss": 0.3418,
261
+ "step": 36
262
+ },
263
+ {
264
+ "epoch": 0.11887550200803212,
265
+ "grad_norm": 3.891775369644165,
266
+ "learning_rate": 9.802776656642341e-06,
267
+ "loss": 0.6575,
268
+ "step": 37
269
+ },
270
+ {
271
+ "epoch": 0.12208835341365462,
272
+ "grad_norm": 2.52054762840271,
273
+ "learning_rate": 9.78800299954203e-06,
274
+ "loss": 0.5537,
275
+ "step": 38
276
+ },
277
+ {
278
+ "epoch": 0.12530120481927712,
279
+ "grad_norm": 2.726982593536377,
280
+ "learning_rate": 9.772707766890726e-06,
281
+ "loss": 0.6654,
282
+ "step": 39
283
+ },
284
+ {
285
+ "epoch": 0.1285140562248996,
286
+ "grad_norm": 2.5964579582214355,
287
+ "learning_rate": 9.756892624856848e-06,
288
+ "loss": 0.5041,
289
+ "step": 40
290
+ },
291
+ {
292
+ "epoch": 0.13172690763052208,
293
+ "grad_norm": 3.1076455116271973,
294
+ "learning_rate": 9.740559296244543e-06,
295
+ "loss": 0.4663,
296
+ "step": 41
297
+ },
298
+ {
299
+ "epoch": 0.13493975903614458,
300
+ "grad_norm": 3.2797796726226807,
301
+ "learning_rate": 9.723709560306009e-06,
302
+ "loss": 0.5047,
303
+ "step": 42
304
+ },
305
+ {
306
+ "epoch": 0.13815261044176708,
307
+ "grad_norm": 2.475914478302002,
308
+ "learning_rate": 9.706345252547681e-06,
309
+ "loss": 0.6025,
310
+ "step": 43
311
+ },
312
+ {
313
+ "epoch": 0.14136546184738955,
314
+ "grad_norm": 2.9697823524475098,
315
+ "learning_rate": 9.688468264530278e-06,
316
+ "loss": 0.3588,
317
+ "step": 44
318
+ },
319
+ {
320
+ "epoch": 0.14457831325301204,
321
+ "grad_norm": 2.9819068908691406,
322
+ "learning_rate": 9.670080543662742e-06,
323
+ "loss": 0.5415,
324
+ "step": 45
325
+ },
326
+ {
327
+ "epoch": 0.14779116465863454,
328
+ "grad_norm": 2.63847017288208,
329
+ "learning_rate": 9.651184092990109e-06,
330
+ "loss": 0.548,
331
+ "step": 46
332
+ },
333
+ {
334
+ "epoch": 0.15100401606425704,
335
+ "grad_norm": 2.9994335174560547,
336
+ "learning_rate": 9.631780970975311e-06,
337
+ "loss": 0.5151,
338
+ "step": 47
339
+ },
340
+ {
341
+ "epoch": 0.15421686746987953,
342
+ "grad_norm": 2.440392017364502,
343
+ "learning_rate": 9.611873291274927e-06,
344
+ "loss": 0.4567,
345
+ "step": 48
346
+ },
347
+ {
348
+ "epoch": 0.157429718875502,
349
+ "grad_norm": 2.510368585586548,
350
+ "learning_rate": 9.591463222508947e-06,
351
+ "loss": 0.5311,
352
+ "step": 49
353
+ },
354
+ {
355
+ "epoch": 0.1606425702811245,
356
+ "grad_norm": 2.9306483268737793,
357
+ "learning_rate": 9.570552988024527e-06,
358
+ "loss": 0.3788,
359
+ "step": 50
360
+ },
361
+ {
362
+ "epoch": 0.163855421686747,
363
+ "grad_norm": 3.0692126750946045,
364
+ "learning_rate": 9.5491448656538e-06,
365
+ "loss": 0.5438,
366
+ "step": 51
367
+ },
368
+ {
369
+ "epoch": 0.1670682730923695,
370
+ "grad_norm": 2.9109346866607666,
371
+ "learning_rate": 9.527241187465735e-06,
372
+ "loss": 0.5757,
373
+ "step": 52
374
+ },
375
+ {
376
+ "epoch": 0.17028112449799196,
377
+ "grad_norm": 2.6207594871520996,
378
+ "learning_rate": 9.504844339512096e-06,
379
+ "loss": 0.5567,
380
+ "step": 53
381
+ },
382
+ {
383
+ "epoch": 0.17349397590361446,
384
+ "grad_norm": 2.2211403846740723,
385
+ "learning_rate": 9.481956761567531e-06,
386
+ "loss": 0.3586,
387
+ "step": 54
388
+ },
389
+ {
390
+ "epoch": 0.17670682730923695,
391
+ "grad_norm": 2.620298147201538,
392
+ "learning_rate": 9.458580946863784e-06,
393
+ "loss": 0.4068,
394
+ "step": 55
395
+ },
396
+ {
397
+ "epoch": 0.17991967871485945,
398
+ "grad_norm": 3.2399895191192627,
399
+ "learning_rate": 9.434719441818106e-06,
400
+ "loss": 0.4569,
401
+ "step": 56
402
+ },
403
+ {
404
+ "epoch": 0.18313253012048192,
405
+ "grad_norm": 3.0689821243286133,
406
+ "learning_rate": 9.410374845755862e-06,
407
+ "loss": 0.4875,
408
+ "step": 57
409
+ },
410
+ {
411
+ "epoch": 0.18634538152610441,
412
+ "grad_norm": 2.3414645195007324,
413
+ "learning_rate": 9.385549810627374e-06,
414
+ "loss": 0.4211,
415
+ "step": 58
416
+ },
417
+ {
418
+ "epoch": 0.1895582329317269,
419
+ "grad_norm": 2.688450336456299,
420
+ "learning_rate": 9.36024704071904e-06,
421
+ "loss": 0.3738,
422
+ "step": 59
423
+ },
424
+ {
425
+ "epoch": 0.1927710843373494,
426
+ "grad_norm": 2.5549943447113037,
427
+ "learning_rate": 9.334469292358736e-06,
428
+ "loss": 0.4483,
429
+ "step": 60
430
+ },
431
+ {
432
+ "epoch": 0.19598393574297188,
433
+ "grad_norm": 3.2717857360839844,
434
+ "learning_rate": 9.308219373615574e-06,
435
+ "loss": 0.4543,
436
+ "step": 61
437
+ },
438
+ {
439
+ "epoch": 0.19919678714859437,
440
+ "grad_norm": 2.5511434078216553,
441
+ "learning_rate": 9.28150014399399e-06,
442
+ "loss": 0.5196,
443
+ "step": 62
444
+ },
445
+ {
446
+ "epoch": 0.20240963855421687,
447
+ "grad_norm": 4.289268970489502,
448
+ "learning_rate": 9.25431451412226e-06,
449
+ "loss": 0.4566,
450
+ "step": 63
451
+ },
452
+ {
453
+ "epoch": 0.20562248995983937,
454
+ "grad_norm": 2.4556353092193604,
455
+ "learning_rate": 9.226665445435428e-06,
456
+ "loss": 0.4854,
457
+ "step": 64
458
+ },
459
+ {
460
+ "epoch": 0.20883534136546184,
461
+ "grad_norm": 2.211134910583496,
462
+ "learning_rate": 9.19855594985271e-06,
463
+ "loss": 0.4626,
464
+ "step": 65
465
+ },
466
+ {
467
+ "epoch": 0.21204819277108433,
468
+ "grad_norm": 2.233396530151367,
469
+ "learning_rate": 9.16998908944939e-06,
470
+ "loss": 0.4688,
471
+ "step": 66
472
+ },
473
+ {
474
+ "epoch": 0.21526104417670683,
475
+ "grad_norm": 2.952958106994629,
476
+ "learning_rate": 9.14096797612326e-06,
477
+ "loss": 0.5239,
478
+ "step": 67
479
+ },
480
+ {
481
+ "epoch": 0.21847389558232932,
482
+ "grad_norm": 2.30307936668396,
483
+ "learning_rate": 9.111495771255623e-06,
484
+ "loss": 0.4765,
485
+ "step": 68
486
+ },
487
+ {
488
+ "epoch": 0.2216867469879518,
489
+ "grad_norm": 3.2021255493164062,
490
+ "learning_rate": 9.081575685366919e-06,
491
+ "loss": 0.579,
492
+ "step": 69
493
+ },
494
+ {
495
+ "epoch": 0.2248995983935743,
496
+ "grad_norm": 2.920560359954834,
497
+ "learning_rate": 9.051210977766987e-06,
498
+ "loss": 0.4866,
499
+ "step": 70
500
+ },
501
+ {
502
+ "epoch": 0.2281124497991968,
503
+ "grad_norm": 2.718421697616577,
504
+ "learning_rate": 9.020404956200016e-06,
505
+ "loss": 0.4484,
506
+ "step": 71
507
+ },
508
+ {
509
+ "epoch": 0.23132530120481928,
510
+ "grad_norm": 2.929945707321167,
511
+ "learning_rate": 8.989160976484218e-06,
512
+ "loss": 0.4432,
513
+ "step": 72
514
+ },
515
+ {
516
+ "epoch": 0.23453815261044178,
517
+ "grad_norm": 3.0811638832092285,
518
+ "learning_rate": 8.957482442146271e-06,
519
+ "loss": 0.5614,
520
+ "step": 73
521
+ },
522
+ {
523
+ "epoch": 0.23775100401606425,
524
+ "grad_norm": 3.6012563705444336,
525
+ "learning_rate": 8.925372804050554e-06,
526
+ "loss": 0.4146,
527
+ "step": 74
528
+ },
529
+ {
530
+ "epoch": 0.24096385542168675,
531
+ "grad_norm": 2.4589638710021973,
532
+ "learning_rate": 8.892835560023236e-06,
533
+ "loss": 0.4544,
534
+ "step": 75
535
+ },
536
+ {
537
+ "epoch": 0.24417670682730924,
538
+ "grad_norm": 2.542167901992798,
539
+ "learning_rate": 8.85987425447124e-06,
540
+ "loss": 0.5399,
541
+ "step": 76
542
+ },
543
+ {
544
+ "epoch": 0.24738955823293174,
545
+ "grad_norm": 2.2393405437469482,
546
+ "learning_rate": 8.826492477996138e-06,
547
+ "loss": 0.4348,
548
+ "step": 77
549
+ },
550
+ {
551
+ "epoch": 0.25060240963855424,
552
+ "grad_norm": 2.5719082355499268,
553
+ "learning_rate": 8.792693867003017e-06,
554
+ "loss": 0.5533,
555
+ "step": 78
556
+ },
557
+ {
558
+ "epoch": 0.25381526104417673,
559
+ "grad_norm": 2.2429287433624268,
560
+ "learning_rate": 8.758482103304348e-06,
561
+ "loss": 0.5089,
562
+ "step": 79
563
+ },
564
+ {
565
+ "epoch": 0.2570281124497992,
566
+ "grad_norm": 2.215193033218384,
567
+ "learning_rate": 8.72386091371891e-06,
568
+ "loss": 0.3864,
569
+ "step": 80
570
+ },
571
+ {
572
+ "epoch": 0.26024096385542167,
573
+ "grad_norm": 2.946692943572998,
574
+ "learning_rate": 8.688834069665819e-06,
575
+ "loss": 0.4929,
576
+ "step": 81
577
+ },
578
+ {
579
+ "epoch": 0.26345381526104417,
580
+ "grad_norm": 2.1577095985412598,
581
+ "learning_rate": 8.653405386753688e-06,
582
+ "loss": 0.3006,
583
+ "step": 82
584
+ },
585
+ {
586
+ "epoch": 0.26666666666666666,
587
+ "grad_norm": 2.4236855506896973,
588
+ "learning_rate": 8.617578724364984e-06,
589
+ "loss": 0.3955,
590
+ "step": 83
591
+ },
592
+ {
593
+ "epoch": 0.26987951807228916,
594
+ "grad_norm": 2.1099393367767334,
595
+ "learning_rate": 8.581357985235595e-06,
596
+ "loss": 0.289,
597
+ "step": 84
598
+ },
599
+ {
600
+ "epoch": 0.27309236947791166,
601
+ "grad_norm": 2.321336030960083,
602
+ "learning_rate": 8.544747115029717e-06,
603
+ "loss": 0.3508,
604
+ "step": 85
605
+ },
606
+ {
607
+ "epoch": 0.27630522088353415,
608
+ "grad_norm": 3.0077357292175293,
609
+ "learning_rate": 8.50775010191001e-06,
610
+ "loss": 0.38,
611
+ "step": 86
612
+ },
613
+ {
614
+ "epoch": 0.27951807228915665,
615
+ "grad_norm": 2.451876640319824,
616
+ "learning_rate": 8.470370976103171e-06,
617
+ "loss": 0.3444,
618
+ "step": 87
619
+ },
620
+ {
621
+ "epoch": 0.2827309236947791,
622
+ "grad_norm": 2.5099685192108154,
623
+ "learning_rate": 8.432613809460895e-06,
624
+ "loss": 0.5357,
625
+ "step": 88
626
+ },
627
+ {
628
+ "epoch": 0.2859437751004016,
629
+ "grad_norm": 2.212970018386841,
630
+ "learning_rate": 8.394482715016318e-06,
631
+ "loss": 0.361,
632
+ "step": 89
633
+ },
634
+ {
635
+ "epoch": 0.2891566265060241,
636
+ "grad_norm": 2.779324531555176,
637
+ "learning_rate": 8.355981846535972e-06,
638
+ "loss": 0.5175,
639
+ "step": 90
640
+ },
641
+ {
642
+ "epoch": 0.2923694779116466,
643
+ "grad_norm": 2.486966133117676,
644
+ "learning_rate": 8.317115398067289e-06,
645
+ "loss": 0.4769,
646
+ "step": 91
647
+ },
648
+ {
649
+ "epoch": 0.2955823293172691,
650
+ "grad_norm": 2.3630478382110596,
651
+ "learning_rate": 8.27788760348173e-06,
652
+ "loss": 0.3273,
653
+ "step": 92
654
+ },
655
+ {
656
+ "epoch": 0.2987951807228916,
657
+ "grad_norm": 3.0468597412109375,
658
+ "learning_rate": 8.238302736013587e-06,
659
+ "loss": 0.465,
660
+ "step": 93
661
+ },
662
+ {
663
+ "epoch": 0.30200803212851407,
664
+ "grad_norm": 2.1398801803588867,
665
+ "learning_rate": 8.198365107794457e-06,
666
+ "loss": 0.5196,
667
+ "step": 94
668
+ },
669
+ {
670
+ "epoch": 0.30522088353413657,
671
+ "grad_norm": 2.3869407176971436,
672
+ "learning_rate": 8.158079069383535e-06,
673
+ "loss": 0.3714,
674
+ "step": 95
675
+ },
676
+ {
677
+ "epoch": 0.30843373493975906,
678
+ "grad_norm": 2.6658105850219727,
679
+ "learning_rate": 8.117449009293668e-06,
680
+ "loss": 0.4704,
681
+ "step": 96
682
+ },
683
+ {
684
+ "epoch": 0.3116465863453815,
685
+ "grad_norm": 3.014535665512085,
686
+ "learning_rate": 8.076479353513308e-06,
687
+ "loss": 0.4581,
688
+ "step": 97
689
+ },
690
+ {
691
+ "epoch": 0.314859437751004,
692
+ "grad_norm": 2.575618028640747,
693
+ "learning_rate": 8.035174565024362e-06,
694
+ "loss": 0.4222,
695
+ "step": 98
696
+ },
697
+ {
698
+ "epoch": 0.3180722891566265,
699
+ "grad_norm": 2.499056577682495,
700
+ "learning_rate": 7.993539143316044e-06,
701
+ "loss": 0.3373,
702
+ "step": 99
703
+ },
704
+ {
705
+ "epoch": 0.321285140562249,
706
+ "grad_norm": 1.9054265022277832,
707
+ "learning_rate": 7.951577623894701e-06,
708
+ "loss": 0.4234,
709
+ "step": 100
710
+ },
711
+ {
712
+ "epoch": 0.3244979919678715,
713
+ "grad_norm": 2.496711015701294,
714
+ "learning_rate": 7.909294577789765e-06,
715
+ "loss": 0.3642,
716
+ "step": 101
717
+ },
718
+ {
719
+ "epoch": 0.327710843373494,
720
+ "grad_norm": 2.4711644649505615,
721
+ "learning_rate": 7.866694611055796e-06,
722
+ "loss": 0.3368,
723
+ "step": 102
724
+ },
725
+ {
726
+ "epoch": 0.3309236947791165,
727
+ "grad_norm": 2.246197462081909,
728
+ "learning_rate": 7.823782364270743e-06,
729
+ "loss": 0.3013,
730
+ "step": 103
731
+ },
732
+ {
733
+ "epoch": 0.334136546184739,
734
+ "grad_norm": 2.3415684700012207,
735
+ "learning_rate": 7.780562512030414e-06,
736
+ "loss": 0.4313,
737
+ "step": 104
738
+ },
739
+ {
740
+ "epoch": 0.3373493975903614,
741
+ "grad_norm": 2.7522289752960205,
742
+ "learning_rate": 7.737039762439263e-06,
743
+ "loss": 0.5178,
744
+ "step": 105
745
+ },
746
+ {
747
+ "epoch": 0.3405622489959839,
748
+ "grad_norm": 2.612412452697754,
749
+ "learning_rate": 7.693218856597515e-06,
750
+ "loss": 0.4683,
751
+ "step": 106
752
+ },
753
+ {
754
+ "epoch": 0.3437751004016064,
755
+ "grad_norm": 2.491758108139038,
756
+ "learning_rate": 7.649104568084701e-06,
757
+ "loss": 0.4714,
758
+ "step": 107
759
+ },
760
+ {
761
+ "epoch": 0.3469879518072289,
762
+ "grad_norm": 2.803365707397461,
763
+ "learning_rate": 7.604701702439652e-06,
764
+ "loss": 0.4583,
765
+ "step": 108
766
+ },
767
+ {
768
+ "epoch": 0.3502008032128514,
769
+ "grad_norm": 2.161888360977173,
770
+ "learning_rate": 7.560015096637015e-06,
771
+ "loss": 0.3207,
772
+ "step": 109
773
+ },
774
+ {
775
+ "epoch": 0.3534136546184739,
776
+ "grad_norm": 2.9406943321228027,
777
+ "learning_rate": 7.515049618560337e-06,
778
+ "loss": 0.4199,
779
+ "step": 110
780
+ },
781
+ {
782
+ "epoch": 0.3566265060240964,
783
+ "grad_norm": 2.4751358032226562,
784
+ "learning_rate": 7.469810166471802e-06,
785
+ "loss": 0.3597,
786
+ "step": 111
787
+ },
788
+ {
789
+ "epoch": 0.3598393574297189,
790
+ "grad_norm": 2.4176955223083496,
791
+ "learning_rate": 7.424301668478626e-06,
792
+ "loss": 0.4643,
793
+ "step": 112
794
+ },
795
+ {
796
+ "epoch": 0.36305220883534134,
797
+ "grad_norm": 1.987687110900879,
798
+ "learning_rate": 7.378529081996233e-06,
799
+ "loss": 0.3283,
800
+ "step": 113
801
+ },
802
+ {
803
+ "epoch": 0.36626506024096384,
804
+ "grad_norm": 2.1835029125213623,
805
+ "learning_rate": 7.332497393208221e-06,
806
+ "loss": 0.3521,
807
+ "step": 114
808
+ },
809
+ {
810
+ "epoch": 0.36947791164658633,
811
+ "grad_norm": 2.5302932262420654,
812
+ "learning_rate": 7.286211616523193e-06,
813
+ "loss": 0.397,
814
+ "step": 115
815
+ },
816
+ {
817
+ "epoch": 0.37269076305220883,
818
+ "grad_norm": 2.3871219158172607,
819
+ "learning_rate": 7.239676794028526e-06,
820
+ "loss": 0.4685,
821
+ "step": 116
822
+ },
823
+ {
824
+ "epoch": 0.3759036144578313,
825
+ "grad_norm": 2.5505502223968506,
826
+ "learning_rate": 7.192897994941111e-06,
827
+ "loss": 0.4473,
828
+ "step": 117
829
+ },
830
+ {
831
+ "epoch": 0.3791164658634538,
832
+ "grad_norm": 2.5716140270233154,
833
+ "learning_rate": 7.145880315055145e-06,
834
+ "loss": 0.2596,
835
+ "step": 118
836
+ },
837
+ {
838
+ "epoch": 0.3823293172690763,
839
+ "grad_norm": 2.9478588104248047,
840
+ "learning_rate": 7.098628876187031e-06,
841
+ "loss": 0.3903,
842
+ "step": 119
843
+ },
844
+ {
845
+ "epoch": 0.3855421686746988,
846
+ "grad_norm": 2.7252838611602783,
847
+ "learning_rate": 7.051148825617435e-06,
848
+ "loss": 0.3011,
849
+ "step": 120
850
+ },
851
+ {
852
+ "epoch": 0.3887550200803213,
853
+ "grad_norm": 1.9095062017440796,
854
+ "learning_rate": 7.003445335530572e-06,
855
+ "loss": 0.242,
856
+ "step": 121
857
+ },
858
+ {
859
+ "epoch": 0.39196787148594375,
860
+ "grad_norm": 2.5876998901367188,
861
+ "learning_rate": 6.95552360245078e-06,
862
+ "loss": 0.4669,
863
+ "step": 122
864
+ },
865
+ {
866
+ "epoch": 0.39518072289156625,
867
+ "grad_norm": 2.438539505004883,
868
+ "learning_rate": 6.9073888466764495e-06,
869
+ "loss": 0.2939,
870
+ "step": 123
871
+ },
872
+ {
873
+ "epoch": 0.39839357429718875,
874
+ "grad_norm": 2.326950788497925,
875
+ "learning_rate": 6.859046311711344e-06,
876
+ "loss": 0.3136,
877
+ "step": 124
878
+ },
879
+ {
880
+ "epoch": 0.40160642570281124,
881
+ "grad_norm": 2.442441463470459,
882
+ "learning_rate": 6.810501263693416e-06,
883
+ "loss": 0.241,
884
+ "step": 125
885
+ },
886
+ {
887
+ "epoch": 0.40481927710843374,
888
+ "grad_norm": 2.1230807304382324,
889
+ "learning_rate": 6.761758990821143e-06,
890
+ "loss": 0.341,
891
+ "step": 126
892
+ },
893
+ {
894
+ "epoch": 0.40803212851405624,
895
+ "grad_norm": 2.29844069480896,
896
+ "learning_rate": 6.712824802777465e-06,
897
+ "loss": 0.2936,
898
+ "step": 127
899
+ },
900
+ {
901
+ "epoch": 0.41124497991967873,
902
+ "grad_norm": 2.681148052215576,
903
+ "learning_rate": 6.66370403015137e-06,
904
+ "loss": 0.271,
905
+ "step": 128
906
+ },
907
+ {
908
+ "epoch": 0.41445783132530123,
909
+ "grad_norm": 2.411924123764038,
910
+ "learning_rate": 6.614402023857231e-06,
911
+ "loss": 0.4166,
912
+ "step": 129
913
+ },
914
+ {
915
+ "epoch": 0.41767068273092367,
916
+ "grad_norm": 2.795295238494873,
917
+ "learning_rate": 6.564924154551895e-06,
918
+ "loss": 0.3683,
919
+ "step": 130
920
+ },
921
+ {
922
+ "epoch": 0.42088353413654617,
923
+ "grad_norm": 2.0868473052978516,
924
+ "learning_rate": 6.515275812049644e-06,
925
+ "loss": 0.2563,
926
+ "step": 131
927
+ },
928
+ {
929
+ "epoch": 0.42409638554216866,
930
+ "grad_norm": 2.622237205505371,
931
+ "learning_rate": 6.4654624047350575e-06,
932
+ "loss": 0.3725,
933
+ "step": 132
934
+ },
935
+ {
936
+ "epoch": 0.42730923694779116,
937
+ "grad_norm": 2.958596706390381,
938
+ "learning_rate": 6.41548935897386e-06,
939
+ "loss": 0.2818,
940
+ "step": 133
941
+ },
942
+ {
943
+ "epoch": 0.43052208835341366,
944
+ "grad_norm": 2.81485915184021,
945
+ "learning_rate": 6.365362118521807e-06,
946
+ "loss": 0.3887,
947
+ "step": 134
948
+ },
949
+ {
950
+ "epoch": 0.43373493975903615,
951
+ "grad_norm": 2.5471572875976562,
952
+ "learning_rate": 6.31508614393167e-06,
953
+ "loss": 0.3455,
954
+ "step": 135
955
+ },
956
+ {
957
+ "epoch": 0.43694779116465865,
958
+ "grad_norm": 5.213260173797607,
959
+ "learning_rate": 6.264666911958404e-06,
960
+ "loss": 0.3154,
961
+ "step": 136
962
+ },
963
+ {
964
+ "epoch": 0.44016064257028115,
965
+ "grad_norm": 2.910991668701172,
966
+ "learning_rate": 6.214109914962542e-06,
967
+ "loss": 0.3963,
968
+ "step": 137
969
+ },
970
+ {
971
+ "epoch": 0.4433734939759036,
972
+ "grad_norm": 2.244249105453491,
973
+ "learning_rate": 6.1634206603118844e-06,
974
+ "loss": 0.2974,
975
+ "step": 138
976
+ },
977
+ {
978
+ "epoch": 0.4465863453815261,
979
+ "grad_norm": 2.3065643310546875,
980
+ "learning_rate": 6.112604669781572e-06,
981
+ "loss": 0.344,
982
+ "step": 139
983
+ },
984
+ {
985
+ "epoch": 0.4497991967871486,
986
+ "grad_norm": 2.094327211380005,
987
+ "learning_rate": 6.06166747895257e-06,
988
+ "loss": 0.3327,
989
+ "step": 140
990
+ },
991
+ {
992
+ "epoch": 0.4530120481927711,
993
+ "grad_norm": 2.5069355964660645,
994
+ "learning_rate": 6.0106146366086514e-06,
995
+ "loss": 0.2542,
996
+ "step": 141
997
+ },
998
+ {
999
+ "epoch": 0.4562248995983936,
1000
+ "grad_norm": 2.227370262145996,
1001
+ "learning_rate": 5.959451704131962e-06,
1002
+ "loss": 0.3063,
1003
+ "step": 142
1004
+ },
1005
+ {
1006
+ "epoch": 0.45943775100401607,
1007
+ "grad_norm": 3.291332244873047,
1008
+ "learning_rate": 5.908184254897183e-06,
1009
+ "loss": 0.3719,
1010
+ "step": 143
1011
+ },
1012
+ {
1013
+ "epoch": 0.46265060240963857,
1014
+ "grad_norm": 2.1085941791534424,
1015
+ "learning_rate": 5.856817873664409e-06,
1016
+ "loss": 0.3306,
1017
+ "step": 144
1018
+ },
1019
+ {
1020
+ "epoch": 0.46586345381526106,
1021
+ "grad_norm": 2.1606924533843994,
1022
+ "learning_rate": 5.8053581559707754e-06,
1023
+ "loss": 0.3847,
1024
+ "step": 145
1025
+ },
1026
+ {
1027
+ "epoch": 0.46907630522088356,
1028
+ "grad_norm": 3.6136634349823,
1029
+ "learning_rate": 5.753810707520918e-06,
1030
+ "loss": 0.409,
1031
+ "step": 146
1032
+ },
1033
+ {
1034
+ "epoch": 0.472289156626506,
1035
+ "grad_norm": 2.9589033126831055,
1036
+ "learning_rate": 5.702181143576323e-06,
1037
+ "loss": 0.519,
1038
+ "step": 147
1039
+ },
1040
+ {
1041
+ "epoch": 0.4755020080321285,
1042
+ "grad_norm": 2.941072463989258,
1043
+ "learning_rate": 5.6504750883436275e-06,
1044
+ "loss": 0.3586,
1045
+ "step": 148
1046
+ },
1047
+ {
1048
+ "epoch": 0.478714859437751,
1049
+ "grad_norm": 2.1762218475341797,
1050
+ "learning_rate": 5.5986981743619615e-06,
1051
+ "loss": 0.3692,
1052
+ "step": 149
1053
+ },
1054
+ {
1055
+ "epoch": 0.4819277108433735,
1056
+ "grad_norm": 1.9385762214660645,
1057
+ "learning_rate": 5.546856041889374e-06,
1058
+ "loss": 0.3044,
1059
+ "step": 150
1060
+ },
1061
+ {
1062
+ "epoch": 0.485140562248996,
1063
+ "grad_norm": 2.7357289791107178,
1064
+ "learning_rate": 5.494954338288404e-06,
1065
+ "loss": 0.4027,
1066
+ "step": 151
1067
+ },
1068
+ {
1069
+ "epoch": 0.4883534136546185,
1070
+ "grad_norm": 2.4241161346435547,
1071
+ "learning_rate": 5.442998717410916e-06,
1072
+ "loss": 0.3591,
1073
+ "step": 152
1074
+ },
1075
+ {
1076
+ "epoch": 0.491566265060241,
1077
+ "grad_norm": 5.211270809173584,
1078
+ "learning_rate": 5.390994838982178e-06,
1079
+ "loss": 0.3662,
1080
+ "step": 153
1081
+ },
1082
+ {
1083
+ "epoch": 0.4947791164658635,
1084
+ "grad_norm": 2.5617518424987793,
1085
+ "learning_rate": 5.338948367984347e-06,
1086
+ "loss": 0.3067,
1087
+ "step": 154
1088
+ },
1089
+ {
1090
+ "epoch": 0.4979919678714859,
1091
+ "grad_norm": 2.434497356414795,
1092
+ "learning_rate": 5.286864974039349e-06,
1093
+ "loss": 0.3023,
1094
+ "step": 155
1095
+ },
1096
+ {
1097
+ "epoch": 0.5012048192771085,
1098
+ "grad_norm": 2.5631253719329834,
1099
+ "learning_rate": 5.234750330791268e-06,
1100
+ "loss": 0.3823,
1101
+ "step": 156
1102
+ },
1103
+ {
1104
+ "epoch": 0.5044176706827309,
1105
+ "grad_norm": 2.003878593444824,
1106
+ "learning_rate": 5.182610115288296e-06,
1107
+ "loss": 0.361,
1108
+ "step": 157
1109
+ },
1110
+ {
1111
+ "epoch": 0.5076305220883535,
1112
+ "grad_norm": 2.133862018585205,
1113
+ "learning_rate": 5.1304500073643045e-06,
1114
+ "loss": 0.2718,
1115
+ "step": 158
1116
+ },
1117
+ {
1118
+ "epoch": 0.5108433734939759,
1119
+ "grad_norm": 2.344472646713257,
1120
+ "learning_rate": 5.078275689020129e-06,
1121
+ "loss": 0.2921,
1122
+ "step": 159
1123
+ },
1124
+ {
1125
+ "epoch": 0.5140562248995983,
1126
+ "grad_norm": 2.4135284423828125,
1127
+ "learning_rate": 5.026092843804599e-06,
1128
+ "loss": 0.3341,
1129
+ "step": 160
1130
+ },
1131
+ {
1132
+ "epoch": 0.5172690763052209,
1133
+ "grad_norm": 2.5881218910217285,
1134
+ "learning_rate": 4.973907156195405e-06,
1135
+ "loss": 0.2986,
1136
+ "step": 161
1137
+ },
1138
+ {
1139
+ "epoch": 0.5204819277108433,
1140
+ "grad_norm": 2.7781827449798584,
1141
+ "learning_rate": 4.921724310979872e-06,
1142
+ "loss": 0.4795,
1143
+ "step": 162
1144
+ },
1145
+ {
1146
+ "epoch": 0.5236947791164659,
1147
+ "grad_norm": 2.816742181777954,
1148
+ "learning_rate": 4.869549992635697e-06,
1149
+ "loss": 0.4024,
1150
+ "step": 163
1151
+ },
1152
+ {
1153
+ "epoch": 0.5269076305220883,
1154
+ "grad_norm": 2.479278326034546,
1155
+ "learning_rate": 4.817389884711706e-06,
1156
+ "loss": 0.381,
1157
+ "step": 164
1158
+ },
1159
+ {
1160
+ "epoch": 0.5301204819277109,
1161
+ "grad_norm": 2.0837013721466064,
1162
+ "learning_rate": 4.765249669208733e-06,
1163
+ "loss": 0.2775,
1164
+ "step": 165
1165
+ },
1166
+ {
1167
+ "epoch": 0.5333333333333333,
1168
+ "grad_norm": 3.396789312362671,
1169
+ "learning_rate": 4.713135025960652e-06,
1170
+ "loss": 0.3469,
1171
+ "step": 166
1172
+ },
1173
+ {
1174
+ "epoch": 0.5365461847389559,
1175
+ "grad_norm": 2.179645299911499,
1176
+ "learning_rate": 4.661051632015655e-06,
1177
+ "loss": 0.375,
1178
+ "step": 167
1179
+ },
1180
+ {
1181
+ "epoch": 0.5397590361445783,
1182
+ "grad_norm": 2.4666926860809326,
1183
+ "learning_rate": 4.609005161017824e-06,
1184
+ "loss": 0.3723,
1185
+ "step": 168
1186
+ },
1187
+ {
1188
+ "epoch": 0.5429718875502008,
1189
+ "grad_norm": 2.0646708011627197,
1190
+ "learning_rate": 4.557001282589086e-06,
1191
+ "loss": 0.3036,
1192
+ "step": 169
1193
+ },
1194
+ {
1195
+ "epoch": 0.5461847389558233,
1196
+ "grad_norm": 2.0979270935058594,
1197
+ "learning_rate": 4.505045661711596e-06,
1198
+ "loss": 0.312,
1199
+ "step": 170
1200
+ },
1201
+ {
1202
+ "epoch": 0.5493975903614458,
1203
+ "grad_norm": 2.0291059017181396,
1204
+ "learning_rate": 4.4531439581106295e-06,
1205
+ "loss": 0.3151,
1206
+ "step": 171
1207
+ },
1208
+ {
1209
+ "epoch": 0.5526104417670683,
1210
+ "grad_norm": 1.8637864589691162,
1211
+ "learning_rate": 4.401301825638039e-06,
1212
+ "loss": 0.2561,
1213
+ "step": 172
1214
+ },
1215
+ {
1216
+ "epoch": 0.5558232931726907,
1217
+ "grad_norm": 2.0765888690948486,
1218
+ "learning_rate": 4.349524911656373e-06,
1219
+ "loss": 0.3333,
1220
+ "step": 173
1221
+ },
1222
+ {
1223
+ "epoch": 0.5590361445783133,
1224
+ "grad_norm": 2.5530149936676025,
1225
+ "learning_rate": 4.297818856423679e-06,
1226
+ "loss": 0.3819,
1227
+ "step": 174
1228
+ },
1229
+ {
1230
+ "epoch": 0.5622489959839357,
1231
+ "grad_norm": 2.2014477252960205,
1232
+ "learning_rate": 4.2461892924790825e-06,
1233
+ "loss": 0.254,
1234
+ "step": 175
1235
+ },
1236
+ {
1237
+ "epoch": 0.5654618473895582,
1238
+ "grad_norm": 3.6043734550476074,
1239
+ "learning_rate": 4.194641844029227e-06,
1240
+ "loss": 0.4335,
1241
+ "step": 176
1242
+ },
1243
+ {
1244
+ "epoch": 0.5686746987951807,
1245
+ "grad_norm": 2.4686837196350098,
1246
+ "learning_rate": 4.143182126335594e-06,
1247
+ "loss": 0.2313,
1248
+ "step": 177
1249
+ },
1250
+ {
1251
+ "epoch": 0.5718875502008032,
1252
+ "grad_norm": 1.9996862411499023,
1253
+ "learning_rate": 4.091815745102818e-06,
1254
+ "loss": 0.296,
1255
+ "step": 178
1256
+ },
1257
+ {
1258
+ "epoch": 0.5751004016064257,
1259
+ "grad_norm": 2.37868595123291,
1260
+ "learning_rate": 4.040548295868039e-06,
1261
+ "loss": 0.2864,
1262
+ "step": 179
1263
+ },
1264
+ {
1265
+ "epoch": 0.5783132530120482,
1266
+ "grad_norm": 2.2569093704223633,
1267
+ "learning_rate": 3.9893853633913485e-06,
1268
+ "loss": 0.2444,
1269
+ "step": 180
1270
+ },
1271
+ {
1272
+ "epoch": 0.5815261044176707,
1273
+ "grad_norm": 3.60836124420166,
1274
+ "learning_rate": 3.938332521047434e-06,
1275
+ "loss": 0.3296,
1276
+ "step": 181
1277
+ },
1278
+ {
1279
+ "epoch": 0.5847389558232932,
1280
+ "grad_norm": 2.706819772720337,
1281
+ "learning_rate": 3.887395330218429e-06,
1282
+ "loss": 0.1887,
1283
+ "step": 182
1284
+ },
1285
+ {
1286
+ "epoch": 0.5879518072289157,
1287
+ "grad_norm": 2.409383535385132,
1288
+ "learning_rate": 3.836579339688116e-06,
1289
+ "loss": 0.29,
1290
+ "step": 183
1291
+ },
1292
+ {
1293
+ "epoch": 0.5911646586345382,
1294
+ "grad_norm": 2.9502406120300293,
1295
+ "learning_rate": 3.7858900850374596e-06,
1296
+ "loss": 0.3951,
1297
+ "step": 184
1298
+ },
1299
+ {
1300
+ "epoch": 0.5943775100401606,
1301
+ "grad_norm": 2.052128314971924,
1302
+ "learning_rate": 3.7353330880415963e-06,
1303
+ "loss": 0.3078,
1304
+ "step": 185
1305
+ },
1306
+ {
1307
+ "epoch": 0.5975903614457831,
1308
+ "grad_norm": 2.404327630996704,
1309
+ "learning_rate": 3.6849138560683305e-06,
1310
+ "loss": 0.4112,
1311
+ "step": 186
1312
+ },
1313
+ {
1314
+ "epoch": 0.6008032128514056,
1315
+ "grad_norm": 2.3815231323242188,
1316
+ "learning_rate": 3.634637881478196e-06,
1317
+ "loss": 0.322,
1318
+ "step": 187
1319
+ },
1320
+ {
1321
+ "epoch": 0.6040160642570281,
1322
+ "grad_norm": 2.2108402252197266,
1323
+ "learning_rate": 3.5845106410261417e-06,
1324
+ "loss": 0.3744,
1325
+ "step": 188
1326
+ },
1327
+ {
1328
+ "epoch": 0.6072289156626506,
1329
+ "grad_norm": 2.1574432849884033,
1330
+ "learning_rate": 3.534537595264944e-06,
1331
+ "loss": 0.2951,
1332
+ "step": 189
1333
+ },
1334
+ {
1335
+ "epoch": 0.6104417670682731,
1336
+ "grad_norm": 2.283750057220459,
1337
+ "learning_rate": 3.4847241879503574e-06,
1338
+ "loss": 0.2728,
1339
+ "step": 190
1340
+ },
1341
+ {
1342
+ "epoch": 0.6136546184738956,
1343
+ "grad_norm": 2.0512266159057617,
1344
+ "learning_rate": 3.435075845448105e-06,
1345
+ "loss": 0.2585,
1346
+ "step": 191
1347
+ },
1348
+ {
1349
+ "epoch": 0.6168674698795181,
1350
+ "grad_norm": 2.368990182876587,
1351
+ "learning_rate": 3.3855979761427705e-06,
1352
+ "loss": 0.4226,
1353
+ "step": 192
1354
+ },
1355
+ {
1356
+ "epoch": 0.6200803212851406,
1357
+ "grad_norm": 2.43058705329895,
1358
+ "learning_rate": 3.3362959698486307e-06,
1359
+ "loss": 0.3779,
1360
+ "step": 193
1361
+ },
1362
+ {
1363
+ "epoch": 0.623293172690763,
1364
+ "grad_norm": 2.8462181091308594,
1365
+ "learning_rate": 3.287175197222537e-06,
1366
+ "loss": 0.4504,
1367
+ "step": 194
1368
+ },
1369
+ {
1370
+ "epoch": 0.6265060240963856,
1371
+ "grad_norm": 2.358142614364624,
1372
+ "learning_rate": 3.2382410091788567e-06,
1373
+ "loss": 0.3824,
1374
+ "step": 195
1375
+ },
1376
+ {
1377
+ "epoch": 0.629718875502008,
1378
+ "grad_norm": 1.9749348163604736,
1379
+ "learning_rate": 3.189498736306584e-06,
1380
+ "loss": 0.3309,
1381
+ "step": 196
1382
+ },
1383
+ {
1384
+ "epoch": 0.6329317269076306,
1385
+ "grad_norm": 2.166022300720215,
1386
+ "learning_rate": 3.140953688288658e-06,
1387
+ "loss": 0.1816,
1388
+ "step": 197
1389
+ },
1390
+ {
1391
+ "epoch": 0.636144578313253,
1392
+ "grad_norm": 2.078129291534424,
1393
+ "learning_rate": 3.0926111533235526e-06,
1394
+ "loss": 0.3417,
1395
+ "step": 198
1396
+ },
1397
+ {
1398
+ "epoch": 0.6393574297188755,
1399
+ "grad_norm": 2.0340514183044434,
1400
+ "learning_rate": 3.044476397549221e-06,
1401
+ "loss": 0.2881,
1402
+ "step": 199
1403
+ },
1404
+ {
1405
+ "epoch": 0.642570281124498,
1406
+ "grad_norm": 2.3322885036468506,
1407
+ "learning_rate": 2.9965546644694287e-06,
1408
+ "loss": 0.2418,
1409
+ "step": 200
1410
+ },
1411
+ {
1412
+ "epoch": 0.6457831325301204,
1413
+ "grad_norm": 2.1144306659698486,
1414
+ "learning_rate": 2.948851174382565e-06,
1415
+ "loss": 0.2758,
1416
+ "step": 201
1417
+ },
1418
+ {
1419
+ "epoch": 0.648995983935743,
1420
+ "grad_norm": 2.089343309402466,
1421
+ "learning_rate": 2.9013711238129693e-06,
1422
+ "loss": 0.3059,
1423
+ "step": 202
1424
+ },
1425
+ {
1426
+ "epoch": 0.6522088353413654,
1427
+ "grad_norm": 2.4294567108154297,
1428
+ "learning_rate": 2.8541196849448582e-06,
1429
+ "loss": 0.3653,
1430
+ "step": 203
1431
+ },
1432
+ {
1433
+ "epoch": 0.655421686746988,
1434
+ "grad_norm": 2.432567596435547,
1435
+ "learning_rate": 2.8071020050588927e-06,
1436
+ "loss": 0.2259,
1437
+ "step": 204
1438
+ },
1439
+ {
1440
+ "epoch": 0.6586345381526104,
1441
+ "grad_norm": 2.2744345664978027,
1442
+ "learning_rate": 2.760323205971476e-06,
1443
+ "loss": 0.3401,
1444
+ "step": 205
1445
+ },
1446
+ {
1447
+ "epoch": 0.661847389558233,
1448
+ "grad_norm": 2.8238182067871094,
1449
+ "learning_rate": 2.7137883834768076e-06,
1450
+ "loss": 0.2912,
1451
+ "step": 206
1452
+ },
1453
+ {
1454
+ "epoch": 0.6650602409638554,
1455
+ "grad_norm": 2.3627312183380127,
1456
+ "learning_rate": 2.6675026067917808e-06,
1457
+ "loss": 0.2491,
1458
+ "step": 207
1459
+ },
1460
+ {
1461
+ "epoch": 0.668273092369478,
1462
+ "grad_norm": 2.5185694694519043,
1463
+ "learning_rate": 2.621470918003768e-06,
1464
+ "loss": 0.2879,
1465
+ "step": 208
1466
+ },
1467
+ {
1468
+ "epoch": 0.6714859437751004,
1469
+ "grad_norm": 2.401811361312866,
1470
+ "learning_rate": 2.5756983315213748e-06,
1471
+ "loss": 0.3885,
1472
+ "step": 209
1473
+ },
1474
+ {
1475
+ "epoch": 0.6746987951807228,
1476
+ "grad_norm": 2.2231993675231934,
1477
+ "learning_rate": 2.5301898335281994e-06,
1478
+ "loss": 0.2833,
1479
+ "step": 210
1480
+ },
1481
+ {
1482
+ "epoch": 0.6779116465863454,
1483
+ "grad_norm": 2.2760813236236572,
1484
+ "learning_rate": 2.4849503814396624e-06,
1485
+ "loss": 0.2532,
1486
+ "step": 211
1487
+ },
1488
+ {
1489
+ "epoch": 0.6811244979919678,
1490
+ "grad_norm": 2.149466037750244,
1491
+ "learning_rate": 2.439984903362988e-06,
1492
+ "loss": 0.2805,
1493
+ "step": 212
1494
+ },
1495
+ {
1496
+ "epoch": 0.6843373493975904,
1497
+ "grad_norm": 2.3873984813690186,
1498
+ "learning_rate": 2.3952982975603494e-06,
1499
+ "loss": 0.3158,
1500
+ "step": 213
1501
+ },
1502
+ {
1503
+ "epoch": 0.6875502008032128,
1504
+ "grad_norm": 2.5702414512634277,
1505
+ "learning_rate": 2.3508954319153e-06,
1506
+ "loss": 0.3221,
1507
+ "step": 214
1508
+ },
1509
+ {
1510
+ "epoch": 0.6907630522088354,
1511
+ "grad_norm": 2.643507719039917,
1512
+ "learning_rate": 2.306781143402485e-06,
1513
+ "loss": 0.3544,
1514
+ "step": 215
1515
+ },
1516
+ {
1517
+ "epoch": 0.6939759036144578,
1518
+ "grad_norm": 2.201829671859741,
1519
+ "learning_rate": 2.2629602375607373e-06,
1520
+ "loss": 0.3529,
1521
+ "step": 216
1522
+ },
1523
+ {
1524
+ "epoch": 0.6971887550200804,
1525
+ "grad_norm": 2.1155354976654053,
1526
+ "learning_rate": 2.219437487969588e-06,
1527
+ "loss": 0.2572,
1528
+ "step": 217
1529
+ },
1530
+ {
1531
+ "epoch": 0.7004016064257028,
1532
+ "grad_norm": 2.038470506668091,
1533
+ "learning_rate": 2.1762176357292582e-06,
1534
+ "loss": 0.2295,
1535
+ "step": 218
1536
+ },
1537
+ {
1538
+ "epoch": 0.7036144578313253,
1539
+ "grad_norm": 2.023308753967285,
1540
+ "learning_rate": 2.1333053889442033e-06,
1541
+ "loss": 0.3592,
1542
+ "step": 219
1543
+ },
1544
+ {
1545
+ "epoch": 0.7068273092369478,
1546
+ "grad_norm": 2.5169191360473633,
1547
+ "learning_rate": 2.0907054222102367e-06,
1548
+ "loss": 0.3218,
1549
+ "step": 220
1550
+ },
1551
+ {
1552
+ "epoch": 0.7100401606425703,
1553
+ "grad_norm": 2.5444135665893555,
1554
+ "learning_rate": 2.048422376105299e-06,
1555
+ "loss": 0.3331,
1556
+ "step": 221
1557
+ },
1558
+ {
1559
+ "epoch": 0.7132530120481928,
1560
+ "grad_norm": 2.6340672969818115,
1561
+ "learning_rate": 2.0064608566839584e-06,
1562
+ "loss": 0.4002,
1563
+ "step": 222
1564
+ },
1565
+ {
1566
+ "epoch": 0.7164658634538152,
1567
+ "grad_norm": 2.0738048553466797,
1568
+ "learning_rate": 1.964825434975639e-06,
1569
+ "loss": 0.2908,
1570
+ "step": 223
1571
+ },
1572
+ {
1573
+ "epoch": 0.7196787148594378,
1574
+ "grad_norm": 2.0317463874816895,
1575
+ "learning_rate": 1.923520646486695e-06,
1576
+ "loss": 0.2293,
1577
+ "step": 224
1578
+ },
1579
+ {
1580
+ "epoch": 0.7228915662650602,
1581
+ "grad_norm": 2.2780821323394775,
1582
+ "learning_rate": 1.8825509907063328e-06,
1583
+ "loss": 0.2623,
1584
+ "step": 225
1585
+ },
1586
+ {
1587
+ "epoch": 0.7261044176706827,
1588
+ "grad_norm": 2.3251006603240967,
1589
+ "learning_rate": 1.8419209306164653e-06,
1590
+ "loss": 0.2582,
1591
+ "step": 226
1592
+ },
1593
+ {
1594
+ "epoch": 0.7293172690763052,
1595
+ "grad_norm": 2.357585906982422,
1596
+ "learning_rate": 1.8016348922055448e-06,
1597
+ "loss": 0.2822,
1598
+ "step": 227
1599
+ },
1600
+ {
1601
+ "epoch": 0.7325301204819277,
1602
+ "grad_norm": 2.6229782104492188,
1603
+ "learning_rate": 1.7616972639864166e-06,
1604
+ "loss": 0.3912,
1605
+ "step": 228
1606
+ },
1607
+ {
1608
+ "epoch": 0.7357429718875502,
1609
+ "grad_norm": 2.2909278869628906,
1610
+ "learning_rate": 1.7221123965182712e-06,
1611
+ "loss": 0.3465,
1612
+ "step": 229
1613
+ },
1614
+ {
1615
+ "epoch": 0.7389558232931727,
1616
+ "grad_norm": 1.9987268447875977,
1617
+ "learning_rate": 1.6828846019327128e-06,
1618
+ "loss": 0.3453,
1619
+ "step": 230
1620
+ },
1621
+ {
1622
+ "epoch": 0.7421686746987952,
1623
+ "grad_norm": 2.4425132274627686,
1624
+ "learning_rate": 1.6440181534640277e-06,
1625
+ "loss": 0.2378,
1626
+ "step": 231
1627
+ },
1628
+ {
1629
+ "epoch": 0.7453815261044177,
1630
+ "grad_norm": 1.847144365310669,
1631
+ "learning_rate": 1.6055172849836826e-06,
1632
+ "loss": 0.2161,
1633
+ "step": 232
1634
+ },
1635
+ {
1636
+ "epoch": 0.7485943775100402,
1637
+ "grad_norm": 2.3810346126556396,
1638
+ "learning_rate": 1.567386190539107e-06,
1639
+ "loss": 0.2798,
1640
+ "step": 233
1641
+ },
1642
+ {
1643
+ "epoch": 0.7518072289156627,
1644
+ "grad_norm": 1.8688887357711792,
1645
+ "learning_rate": 1.5296290238968303e-06,
1646
+ "loss": 0.2168,
1647
+ "step": 234
1648
+ },
1649
+ {
1650
+ "epoch": 0.7550200803212851,
1651
+ "grad_norm": 3.1742520332336426,
1652
+ "learning_rate": 1.4922498980899907e-06,
1653
+ "loss": 0.3661,
1654
+ "step": 235
1655
+ },
1656
+ {
1657
+ "epoch": 0.7582329317269076,
1658
+ "grad_norm": 2.355210542678833,
1659
+ "learning_rate": 1.4552528849702852e-06,
1660
+ "loss": 0.2874,
1661
+ "step": 236
1662
+ },
1663
+ {
1664
+ "epoch": 0.7614457831325301,
1665
+ "grad_norm": 3.493511915206909,
1666
+ "learning_rate": 1.4186420147644053e-06,
1667
+ "loss": 0.3802,
1668
+ "step": 237
1669
+ },
1670
+ {
1671
+ "epoch": 0.7646586345381526,
1672
+ "grad_norm": 1.8001739978790283,
1673
+ "learning_rate": 1.3824212756350196e-06,
1674
+ "loss": 0.2334,
1675
+ "step": 238
1676
+ },
1677
+ {
1678
+ "epoch": 0.7678714859437751,
1679
+ "grad_norm": 1.9317985773086548,
1680
+ "learning_rate": 1.3465946132463125e-06,
1681
+ "loss": 0.245,
1682
+ "step": 239
1683
+ },
1684
+ {
1685
+ "epoch": 0.7710843373493976,
1686
+ "grad_norm": 2.0644335746765137,
1687
+ "learning_rate": 1.3111659303341824e-06,
1688
+ "loss": 0.243,
1689
+ "step": 240
1690
+ },
1691
+ {
1692
+ "epoch": 0.7742971887550201,
1693
+ "grad_norm": 2.091796398162842,
1694
+ "learning_rate": 1.2761390862810907e-06,
1695
+ "loss": 0.2382,
1696
+ "step": 241
1697
+ },
1698
+ {
1699
+ "epoch": 0.7775100401606426,
1700
+ "grad_norm": 2.3172249794006348,
1701
+ "learning_rate": 1.2415178966956531e-06,
1702
+ "loss": 0.2797,
1703
+ "step": 242
1704
+ },
1705
+ {
1706
+ "epoch": 0.7807228915662651,
1707
+ "grad_norm": 2.269488573074341,
1708
+ "learning_rate": 1.2073061329969843e-06,
1709
+ "loss": 0.3524,
1710
+ "step": 243
1711
+ },
1712
+ {
1713
+ "epoch": 0.7839357429718875,
1714
+ "grad_norm": 2.3127260208129883,
1715
+ "learning_rate": 1.1735075220038634e-06,
1716
+ "loss": 0.2378,
1717
+ "step": 244
1718
+ },
1719
+ {
1720
+ "epoch": 0.7871485943775101,
1721
+ "grad_norm": 1.958678126335144,
1722
+ "learning_rate": 1.1401257455287612e-06,
1723
+ "loss": 0.2539,
1724
+ "step": 245
1725
+ },
1726
+ {
1727
+ "epoch": 0.7903614457831325,
1728
+ "grad_norm": 2.530526876449585,
1729
+ "learning_rate": 1.107164439976764e-06,
1730
+ "loss": 0.2905,
1731
+ "step": 246
1732
+ },
1733
+ {
1734
+ "epoch": 0.793574297188755,
1735
+ "grad_norm": 2.522650957107544,
1736
+ "learning_rate": 1.0746271959494453e-06,
1737
+ "loss": 0.3653,
1738
+ "step": 247
1739
+ },
1740
+ {
1741
+ "epoch": 0.7967871485943775,
1742
+ "grad_norm": 2.9361941814422607,
1743
+ "learning_rate": 1.04251755785373e-06,
1744
+ "loss": 0.3054,
1745
+ "step": 248
1746
+ },
1747
+ {
1748
+ "epoch": 0.8,
1749
+ "grad_norm": 2.346240758895874,
1750
+ "learning_rate": 1.0108390235157828e-06,
1751
+ "loss": 0.2623,
1752
+ "step": 249
1753
+ },
1754
+ {
1755
+ "epoch": 0.8032128514056225,
1756
+ "grad_norm": 2.452631711959839,
1757
+ "learning_rate": 9.795950437999852e-07,
1758
+ "loss": 0.2113,
1759
+ "step": 250
1760
+ },
1761
+ {
1762
+ "epoch": 0.8064257028112449,
1763
+ "grad_norm": 2.9312245845794678,
1764
+ "learning_rate": 9.487890222330137e-07,
1765
+ "loss": 0.4234,
1766
+ "step": 251
1767
+ },
1768
+ {
1769
+ "epoch": 0.8096385542168675,
1770
+ "grad_norm": 2.1248555183410645,
1771
+ "learning_rate": 9.184243146330829e-07,
1772
+ "loss": 0.2872,
1773
+ "step": 252
1774
+ },
1775
+ {
1776
+ "epoch": 0.8128514056224899,
1777
+ "grad_norm": 2.393862247467041,
1778
+ "learning_rate": 8.885042287443785e-07,
1779
+ "loss": 0.2849,
1780
+ "step": 253
1781
+ },
1782
+ {
1783
+ "epoch": 0.8160642570281125,
1784
+ "grad_norm": 2.453737258911133,
1785
+ "learning_rate": 8.590320238767425e-07,
1786
+ "loss": 0.2962,
1787
+ "step": 254
1788
+ },
1789
+ {
1790
+ "epoch": 0.8192771084337349,
1791
+ "grad_norm": 2.9770538806915283,
1792
+ "learning_rate": 8.30010910550611e-07,
1793
+ "loss": 0.3318,
1794
+ "step": 255
1795
+ },
1796
+ {
1797
+ "epoch": 0.8224899598393575,
1798
+ "grad_norm": 1.8912490606307983,
1799
+ "learning_rate": 8.014440501472909e-07,
1800
+ "loss": 0.2446,
1801
+ "step": 256
1802
+ },
1803
+ {
1804
+ "epoch": 0.8257028112449799,
1805
+ "grad_norm": 3.3345978260040283,
1806
+ "learning_rate": 7.733345545645726e-07,
1807
+ "loss": 0.3895,
1808
+ "step": 257
1809
+ },
1810
+ {
1811
+ "epoch": 0.8289156626506025,
1812
+ "grad_norm": 2.2788500785827637,
1813
+ "learning_rate": 7.456854858777418e-07,
1814
+ "loss": 0.2767,
1815
+ "step": 258
1816
+ },
1817
+ {
1818
+ "epoch": 0.8321285140562249,
1819
+ "grad_norm": 2.039923906326294,
1820
+ "learning_rate": 7.184998560060114e-07,
1821
+ "loss": 0.1987,
1822
+ "step": 259
1823
+ },
1824
+ {
1825
+ "epoch": 0.8353413654618473,
1826
+ "grad_norm": 2.3319668769836426,
1827
+ "learning_rate": 6.917806263844268e-07,
1828
+ "loss": 0.2265,
1829
+ "step": 260
1830
+ },
1831
+ {
1832
+ "epoch": 0.8385542168674699,
1833
+ "grad_norm": 3.085174560546875,
1834
+ "learning_rate": 6.655307076412637e-07,
1835
+ "loss": 0.3313,
1836
+ "step": 261
1837
+ },
1838
+ {
1839
+ "epoch": 0.8417670682730923,
1840
+ "grad_norm": 1.9811396598815918,
1841
+ "learning_rate": 6.397529592809615e-07,
1842
+ "loss": 0.2447,
1843
+ "step": 262
1844
+ },
1845
+ {
1846
+ "epoch": 0.8449799196787149,
1847
+ "grad_norm": 1.8202673196792603,
1848
+ "learning_rate": 6.14450189372628e-07,
1849
+ "loss": 0.2427,
1850
+ "step": 263
1851
+ },
1852
+ {
1853
+ "epoch": 0.8481927710843373,
1854
+ "grad_norm": 2.299744129180908,
1855
+ "learning_rate": 5.896251542441395e-07,
1856
+ "loss": 0.2412,
1857
+ "step": 264
1858
+ },
1859
+ {
1860
+ "epoch": 0.8514056224899599,
1861
+ "grad_norm": 2.096524953842163,
1862
+ "learning_rate": 5.652805581818943e-07,
1863
+ "loss": 0.2329,
1864
+ "step": 265
1865
+ },
1866
+ {
1867
+ "epoch": 0.8546184738955823,
1868
+ "grad_norm": 2.4148600101470947,
1869
+ "learning_rate": 5.414190531362162e-07,
1870
+ "loss": 0.2183,
1871
+ "step": 266
1872
+ },
1873
+ {
1874
+ "epoch": 0.8578313253012049,
1875
+ "grad_norm": 1.961333155632019,
1876
+ "learning_rate": 5.180432384324691e-07,
1877
+ "loss": 0.2278,
1878
+ "step": 267
1879
+ },
1880
+ {
1881
+ "epoch": 0.8610441767068273,
1882
+ "grad_norm": 1.9496490955352783,
1883
+ "learning_rate": 4.951556604879049e-07,
1884
+ "loss": 0.1662,
1885
+ "step": 268
1886
+ },
1887
+ {
1888
+ "epoch": 0.8642570281124498,
1889
+ "grad_norm": 2.195952892303467,
1890
+ "learning_rate": 4.727588125342669e-07,
1891
+ "loss": 0.2645,
1892
+ "step": 269
1893
+ },
1894
+ {
1895
+ "epoch": 0.8674698795180723,
1896
+ "grad_norm": 1.8171138763427734,
1897
+ "learning_rate": 4.508551343462014e-07,
1898
+ "loss": 0.1446,
1899
+ "step": 270
1900
+ },
1901
+ {
1902
+ "epoch": 0.8706827309236947,
1903
+ "grad_norm": 2.3251218795776367,
1904
+ "learning_rate": 4.29447011975474e-07,
1905
+ "loss": 0.2679,
1906
+ "step": 271
1907
+ },
1908
+ {
1909
+ "epoch": 0.8738955823293173,
1910
+ "grad_norm": 2.06396746635437,
1911
+ "learning_rate": 4.0853677749105426e-07,
1912
+ "loss": 0.2793,
1913
+ "step": 272
1914
+ },
1915
+ {
1916
+ "epoch": 0.8771084337349397,
1917
+ "grad_norm": 2.0605921745300293,
1918
+ "learning_rate": 3.8812670872507454e-07,
1919
+ "loss": 0.2798,
1920
+ "step": 273
1921
+ },
1922
+ {
1923
+ "epoch": 0.8803212851405623,
1924
+ "grad_norm": 2.0419671535491943,
1925
+ "learning_rate": 3.6821902902469066e-07,
1926
+ "loss": 0.3494,
1927
+ "step": 274
1928
+ },
1929
+ {
1930
+ "epoch": 0.8835341365461847,
1931
+ "grad_norm": 1.9494353532791138,
1932
+ "learning_rate": 3.4881590700989175e-07,
1933
+ "loss": 0.3164,
1934
+ "step": 275
1935
+ },
1936
+ {
1937
+ "epoch": 0.8867469879518072,
1938
+ "grad_norm": 2.2515294551849365,
1939
+ "learning_rate": 3.299194563372604e-07,
1940
+ "loss": 0.2364,
1941
+ "step": 276
1942
+ },
1943
+ {
1944
+ "epoch": 0.8899598393574297,
1945
+ "grad_norm": 2.4932332038879395,
1946
+ "learning_rate": 3.1153173546972395e-07,
1947
+ "loss": 0.1982,
1948
+ "step": 277
1949
+ },
1950
+ {
1951
+ "epoch": 0.8931726907630522,
1952
+ "grad_norm": 1.9155004024505615,
1953
+ "learning_rate": 2.9365474745231935e-07,
1954
+ "loss": 0.2864,
1955
+ "step": 278
1956
+ },
1957
+ {
1958
+ "epoch": 0.8963855421686747,
1959
+ "grad_norm": 2.346914529800415,
1960
+ "learning_rate": 2.7629043969399193e-07,
1961
+ "loss": 0.3323,
1962
+ "step": 279
1963
+ },
1964
+ {
1965
+ "epoch": 0.8995983935742972,
1966
+ "grad_norm": 1.957244634628296,
1967
+ "learning_rate": 2.594407037554586e-07,
1968
+ "loss": 0.1917,
1969
+ "step": 280
1970
+ },
1971
+ {
1972
+ "epoch": 0.9028112449799197,
1973
+ "grad_norm": 2.4275577068328857,
1974
+ "learning_rate": 2.431073751431529e-07,
1975
+ "loss": 0.3431,
1976
+ "step": 281
1977
+ },
1978
+ {
1979
+ "epoch": 0.9060240963855422,
1980
+ "grad_norm": 2.3854384422302246,
1981
+ "learning_rate": 2.2729223310927473e-07,
1982
+ "loss": 0.1988,
1983
+ "step": 282
1984
+ },
1985
+ {
1986
+ "epoch": 0.9092369477911647,
1987
+ "grad_norm": 2.0022900104522705,
1988
+ "learning_rate": 2.1199700045797077e-07,
1989
+ "loss": 0.2806,
1990
+ "step": 283
1991
+ },
1992
+ {
1993
+ "epoch": 0.9124497991967871,
1994
+ "grad_norm": 2.0023062229156494,
1995
+ "learning_rate": 1.9722334335766092e-07,
1996
+ "loss": 0.3024,
1997
+ "step": 284
1998
+ },
1999
+ {
2000
+ "epoch": 0.9156626506024096,
2001
+ "grad_norm": 1.8861989974975586,
2002
+ "learning_rate": 1.829728711595391e-07,
2003
+ "loss": 0.2559,
2004
+ "step": 285
2005
+ },
2006
+ {
2007
+ "epoch": 0.9188755020080321,
2008
+ "grad_norm": 2.1966798305511475,
2009
+ "learning_rate": 1.6924713622225975e-07,
2010
+ "loss": 0.2724,
2011
+ "step": 286
2012
+ },
2013
+ {
2014
+ "epoch": 0.9220883534136546,
2015
+ "grad_norm": 1.6553095579147339,
2016
+ "learning_rate": 1.5604763374283073e-07,
2017
+ "loss": 0.1017,
2018
+ "step": 287
2019
+ },
2020
+ {
2021
+ "epoch": 0.9253012048192771,
2022
+ "grad_norm": 2.2511794567108154,
2023
+ "learning_rate": 1.4337580159373864e-07,
2024
+ "loss": 0.2722,
2025
+ "step": 288
2026
+ },
2027
+ {
2028
+ "epoch": 0.9285140562248996,
2029
+ "grad_norm": 2.169060230255127,
2030
+ "learning_rate": 1.3123302016631477e-07,
2031
+ "loss": 0.265,
2032
+ "step": 289
2033
+ },
2034
+ {
2035
+ "epoch": 0.9317269076305221,
2036
+ "grad_norm": 2.9961092472076416,
2037
+ "learning_rate": 1.196206122203647e-07,
2038
+ "loss": 0.3931,
2039
+ "step": 290
2040
+ },
2041
+ {
2042
+ "epoch": 0.9349397590361446,
2043
+ "grad_norm": 2.435908794403076,
2044
+ "learning_rate": 1.0853984274007246e-07,
2045
+ "loss": 0.2906,
2046
+ "step": 291
2047
+ },
2048
+ {
2049
+ "epoch": 0.9381526104417671,
2050
+ "grad_norm": 2.8427674770355225,
2051
+ "learning_rate": 9.799191879620474e-08,
2052
+ "loss": 0.2227,
2053
+ "step": 292
2054
+ },
2055
+ {
2056
+ "epoch": 0.9413654618473896,
2057
+ "grad_norm": 2.3079280853271484,
2058
+ "learning_rate": 8.797798941461655e-08,
2059
+ "loss": 0.2599,
2060
+ "step": 293
2061
+ },
2062
+ {
2063
+ "epoch": 0.944578313253012,
2064
+ "grad_norm": 2.447566032409668,
2065
+ "learning_rate": 7.84991454510864e-08,
2066
+ "loss": 0.25,
2067
+ "step": 294
2068
+ },
2069
+ {
2070
+ "epoch": 0.9477911646586346,
2071
+ "grad_norm": 2.5724740028381348,
2072
+ "learning_rate": 6.955641947248127e-08,
2073
+ "loss": 0.236,
2074
+ "step": 295
2075
+ },
2076
+ {
2077
+ "epoch": 0.951004016064257,
2078
+ "grad_norm": 2.4259979724884033,
2079
+ "learning_rate": 6.115078564427946e-08,
2080
+ "loss": 0.2004,
2081
+ "step": 296
2082
+ },
2083
+ {
2084
+ "epoch": 0.9542168674698795,
2085
+ "grad_norm": 1.9986400604248047,
2086
+ "learning_rate": 5.3283159624448745e-08,
2087
+ "loss": 0.2437,
2088
+ "step": 297
2089
+ },
2090
+ {
2091
+ "epoch": 0.957429718875502,
2092
+ "grad_norm": 2.7118661403656006,
2093
+ "learning_rate": 4.5954398463700647e-08,
2094
+ "loss": 0.4067,
2095
+ "step": 298
2096
+ },
2097
+ {
2098
+ "epoch": 0.9606425702811245,
2099
+ "grad_norm": 2.030395984649658,
2100
+ "learning_rate": 3.916530051212841e-08,
2101
+ "loss": 0.2347,
2102
+ "step": 299
2103
+ },
2104
+ {
2105
+ "epoch": 0.963855421686747,
2106
+ "grad_norm": 1.5845026969909668,
2107
+ "learning_rate": 3.2916605332238284e-08,
2108
+ "loss": 0.153,
2109
+ "step": 300
2110
+ },
2111
+ {
2112
+ "epoch": 0.9670682730923694,
2113
+ "grad_norm": 2.0269935131073,
2114
+ "learning_rate": 2.7208993618390578e-08,
2115
+ "loss": 0.1627,
2116
+ "step": 301
2117
+ },
2118
+ {
2119
+ "epoch": 0.970281124497992,
2120
+ "grad_norm": 1.7263789176940918,
2121
+ "learning_rate": 2.2043087122644023e-08,
2122
+ "loss": 0.1601,
2123
+ "step": 302
2124
+ },
2125
+ {
2126
+ "epoch": 0.9734939759036144,
2127
+ "grad_norm": 1.7540363073349,
2128
+ "learning_rate": 1.741944858702771e-08,
2129
+ "loss": 0.2004,
2130
+ "step": 303
2131
+ },
2132
+ {
2133
+ "epoch": 0.976706827309237,
2134
+ "grad_norm": 2.1071643829345703,
2135
+ "learning_rate": 1.333858168224178e-08,
2136
+ "loss": 0.2089,
2137
+ "step": 304
2138
+ },
2139
+ {
2140
+ "epoch": 0.9799196787148594,
2141
+ "grad_norm": 2.682478904724121,
2142
+ "learning_rate": 9.800930952786336e-09,
2143
+ "loss": 0.3269,
2144
+ "step": 305
2145
+ },
2146
+ {
2147
+ "epoch": 0.983132530120482,
2148
+ "grad_norm": 2.5605387687683105,
2149
+ "learning_rate": 6.806881768539053e-09,
2150
+ "loss": 0.2146,
2151
+ "step": 306
2152
+ },
2153
+ {
2154
+ "epoch": 0.9863453815261044,
2155
+ "grad_norm": 1.8436514139175415,
2156
+ "learning_rate": 4.356760282773209e-09,
2157
+ "loss": 0.2533,
2158
+ "step": 307
2159
+ },
2160
+ {
2161
+ "epoch": 0.989558232931727,
2162
+ "grad_norm": 2.426079034805298,
2163
+ "learning_rate": 2.4508333966305473e-09,
2164
+ "loss": 0.3066,
2165
+ "step": 308
2166
+ },
2167
+ {
2168
+ "epoch": 0.9927710843373494,
2169
+ "grad_norm": 2.5514280796051025,
2170
+ "learning_rate": 1.089308730043981e-09,
2171
+ "loss": 0.2522,
2172
+ "step": 309
2173
+ },
2174
+ {
2175
+ "epoch": 0.9959839357429718,
2176
+ "grad_norm": 3.108274221420288,
2177
+ "learning_rate": 2.723345991245685e-10,
2178
+ "loss": 0.3181,
2179
+ "step": 310
2180
+ },
2181
+ {
2182
+ "epoch": 0.9991967871485944,
2183
+ "grad_norm": 1.9873707294464111,
2184
+ "learning_rate": 0.0,
2185
+ "loss": 0.2276,
2186
+ "step": 311
2187
+ },
2188
+ {
2189
+ "epoch": 0.9991967871485944,
2190
+ "step": 311,
2191
+ "total_flos": 9.654917969372774e+16,
2192
+ "train_loss": 0.38514137028497897,
2193
+ "train_runtime": 6718.4685,
2194
+ "train_samples_per_second": 1.483,
2195
+ "train_steps_per_second": 0.046
2196
+ }
2197
+ ],
2198
+ "logging_steps": 1.0,
2199
+ "max_steps": 311,
2200
+ "num_input_tokens_seen": 0,
2201
+ "num_train_epochs": 1,
2202
+ "save_steps": 500,
2203
+ "stateful_callbacks": {
2204
+ "TrainerControl": {
2205
+ "args": {
2206
+ "should_epoch_stop": false,
2207
+ "should_evaluate": false,
2208
+ "should_log": false,
2209
+ "should_save": true,
2210
+ "should_training_stop": true
2211
+ },
2212
+ "attributes": {}
2213
+ }
2214
+ },
2215
+ "total_flos": 9.654917969372774e+16,
2216
+ "train_batch_size": 1,
2217
+ "trial_name": null,
2218
+ "trial_params": null
2219
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba13fafd79b52a28f4ad9217a1e459b2364c55c4322937c73e141ed5407b98de
3
+ size 7352
vocab.json ADDED
The diff for this file is too large to render. See raw diff