|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9963008631319359,
  "eval_steps": 500,
  "global_step": 101,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.009864364981504316,
      "grad_norm": 0.7294987440109253,
      "learning_rate": 4.998790705729971e-05,
      "loss": 1.0213,
      "num_input_tokens_seen": 2097152,
      "step": 1
    },
    {
      "epoch": 0.01972872996300863,
      "grad_norm": 0.5664029717445374,
      "learning_rate": 4.995163992833986e-05,
      "loss": 1.0136,
      "num_input_tokens_seen": 4194304,
      "step": 2
    },
    {
      "epoch": 0.029593094944512947,
      "grad_norm": 0.42022261023521423,
      "learning_rate": 4.989123369922547e-05,
      "loss": 0.9404,
      "num_input_tokens_seen": 6291456,
      "step": 3
    },
    {
      "epoch": 0.03945745992601726,
      "grad_norm": 0.3317222595214844,
      "learning_rate": 4.980674680908192e-05,
      "loss": 0.953,
      "num_input_tokens_seen": 8388608,
      "step": 4
    },
    {
      "epoch": 0.04932182490752158,
      "grad_norm": 0.2700234651565552,
      "learning_rate": 4.969826099351892e-05,
      "loss": 0.9453,
      "num_input_tokens_seen": 10485760,
      "step": 5
    },
    {
      "epoch": 0.059186189889025895,
      "grad_norm": 0.23365087807178497,
      "learning_rate": 4.9565881205556594e-05,
      "loss": 0.9111,
      "num_input_tokens_seen": 12582912,
      "step": 6
    },
    {
      "epoch": 0.0690505548705302,
      "grad_norm": 0.17710435390472412,
      "learning_rate": 4.940973551409018e-05,
      "loss": 0.878,
      "num_input_tokens_seen": 14680064,
      "step": 7
    },
    {
      "epoch": 0.07891491985203453,
      "grad_norm": 0.17090857028961182,
      "learning_rate": 4.922997497999166e-05,
      "loss": 0.914,
      "num_input_tokens_seen": 16777216,
      "step": 8
    },
    {
      "epoch": 0.08877928483353884,
      "grad_norm": 0.18688030540943146,
      "learning_rate": 4.9026773509968115e-05,
      "loss": 0.8649,
      "num_input_tokens_seen": 18874368,
      "step": 9
    },
    {
      "epoch": 0.09864364981504316,
      "grad_norm": 0.19690372049808502,
      "learning_rate": 4.8800327688318246e-05,
      "loss": 0.9029,
      "num_input_tokens_seen": 20971520,
      "step": 10
    },
    {
      "epoch": 0.10850801479654747,
      "grad_norm": 0.16644370555877686,
      "learning_rate": 4.855085658674973e-05,
      "loss": 0.882,
      "num_input_tokens_seen": 23068672,
      "step": 11
    },
    {
      "epoch": 0.11837237977805179,
      "grad_norm": 0.1807110756635666,
      "learning_rate": 4.827860155244149e-05,
      "loss": 0.8438,
      "num_input_tokens_seen": 25165824,
      "step": 12
    },
    {
      "epoch": 0.1282367447595561,
      "grad_norm": 0.16156087815761566,
      "learning_rate": 4.798382597455591e-05,
      "loss": 0.8743,
      "num_input_tokens_seen": 27262976,
      "step": 13
    },
    {
      "epoch": 0.1381011097410604,
      "grad_norm": 0.14602774381637573,
      "learning_rate": 4.7666815029426816e-05,
      "loss": 0.8736,
      "num_input_tokens_seen": 29360128,
      "step": 14
    },
    {
      "epoch": 0.14796547472256474,
      "grad_norm": 0.15224798023700714,
      "learning_rate": 4.732787540466979e-05,
      "loss": 0.8687,
      "num_input_tokens_seen": 31457280,
      "step": 15
    },
    {
      "epoch": 0.15782983970406905,
      "grad_norm": 0.14535664021968842,
      "learning_rate": 4.696733500248172e-05,
      "loss": 0.864,
      "num_input_tokens_seen": 33554432,
      "step": 16
    },
    {
      "epoch": 0.16769420468557336,
      "grad_norm": 0.1372939944267273,
      "learning_rate": 4.658554262241659e-05,
      "loss": 0.8877,
      "num_input_tokens_seen": 35651584,
      "step": 17
    },
    {
      "epoch": 0.17755856966707767,
      "grad_norm": 0.1264754980802536,
      "learning_rate": 4.6182867623944436e-05,
      "loss": 0.8749,
      "num_input_tokens_seen": 37748736,
      "step": 18
    },
    {
      "epoch": 0.187422934648582,
      "grad_norm": 0.11968766152858734,
      "learning_rate": 4.575969956911994e-05,
      "loss": 0.8338,
      "num_input_tokens_seen": 39845888,
      "step": 19
    },
    {
      "epoch": 0.19728729963008632,
      "grad_norm": 0.1143808588385582,
      "learning_rate": 4.531644784570626e-05,
      "loss": 0.8294,
      "num_input_tokens_seen": 41943040,
      "step": 20
    },
    {
      "epoch": 0.20715166461159062,
      "grad_norm": 0.12028194963932037,
      "learning_rate": 4.485354127111884e-05,
      "loss": 0.8666,
      "num_input_tokens_seen": 44040192,
      "step": 21
    },
    {
      "epoch": 0.21701602959309493,
      "grad_norm": 0.11237838119268417,
      "learning_rate": 4.437142767757225e-05,
      "loss": 0.8038,
      "num_input_tokens_seen": 46137344,
      "step": 22
    },
    {
      "epoch": 0.22688039457459927,
      "grad_norm": 0.1187497228384018,
      "learning_rate": 4.387057347883143e-05,
      "loss": 0.8492,
      "num_input_tokens_seen": 48234496,
      "step": 23
    },
    {
      "epoch": 0.23674475955610358,
      "grad_norm": 0.11182423681020737,
      "learning_rate": 4.335146321898651e-05,
      "loss": 0.8047,
      "num_input_tokens_seen": 50331648,
      "step": 24
    },
    {
      "epoch": 0.2466091245376079,
      "grad_norm": 0.118191197514534,
      "learning_rate": 4.281459910368768e-05,
      "loss": 0.8521,
      "num_input_tokens_seen": 52428800,
      "step": 25
    },
    {
      "epoch": 0.2564734895191122,
      "grad_norm": 0.09803211688995361,
      "learning_rate": 4.226050051429367e-05,
      "loss": 0.8133,
      "num_input_tokens_seen": 54525952,
      "step": 26
    },
    {
      "epoch": 0.26633785450061653,
      "grad_norm": 0.10156495869159698,
      "learning_rate": 4.168970350540384e-05,
      "loss": 0.8365,
      "num_input_tokens_seen": 56623104,
      "step": 27
    },
    {
      "epoch": 0.2762022194821208,
      "grad_norm": 0.10211872309446335,
      "learning_rate": 4.110276028625995e-05,
      "loss": 0.8094,
      "num_input_tokens_seen": 58720256,
      "step": 28
    },
    {
      "epoch": 0.28606658446362515,
      "grad_norm": 0.10365650057792664,
      "learning_rate": 4.050023868651938e-05,
      "loss": 0.8174,
      "num_input_tokens_seen": 60817408,
      "step": 29
    },
    {
      "epoch": 0.2959309494451295,
      "grad_norm": 0.09515471011400223,
      "learning_rate": 3.988272160691665e-05,
      "loss": 0.8278,
      "num_input_tokens_seen": 62914560,
      "step": 30
    },
    {
      "epoch": 0.30579531442663377,
      "grad_norm": 0.10051083564758301,
      "learning_rate": 3.925080645534457e-05,
      "loss": 0.8425,
      "num_input_tokens_seen": 65011712,
      "step": 31
    },
    {
      "epoch": 0.3156596794081381,
      "grad_norm": 0.0998254269361496,
      "learning_rate": 3.8605104568900685e-05,
      "loss": 0.8145,
      "num_input_tokens_seen": 67108864,
      "step": 32
    },
    {
      "epoch": 0.32552404438964244,
      "grad_norm": 0.109571672976017,
      "learning_rate": 3.7946240622458135e-05,
      "loss": 0.8322,
      "num_input_tokens_seen": 69206016,
      "step": 33
    },
    {
      "epoch": 0.3353884093711467,
      "grad_norm": 0.09529263526201248,
      "learning_rate": 3.7274852024333054e-05,
      "loss": 0.8114,
      "num_input_tokens_seen": 71303168,
      "step": 34
    },
    {
      "epoch": 0.34525277435265106,
      "grad_norm": 0.09562862664461136,
      "learning_rate": 3.6591588299633186e-05,
      "loss": 0.8091,
      "num_input_tokens_seen": 73400320,
      "step": 35
    },
    {
      "epoch": 0.35511713933415534,
      "grad_norm": 0.09187028557062149,
      "learning_rate": 3.589711046188428e-05,
      "loss": 0.7743,
      "num_input_tokens_seen": 75497472,
      "step": 36
    },
    {
      "epoch": 0.3649815043156597,
      "grad_norm": 0.09879715740680695,
      "learning_rate": 3.519209037354222e-05,
      "loss": 0.8368,
      "num_input_tokens_seen": 77594624,
      "step": 37
    },
    {
      "epoch": 0.374845869297164,
      "grad_norm": 0.0839247927069664,
      "learning_rate": 3.447721009600949e-05,
      "loss": 0.8177,
      "num_input_tokens_seen": 79691776,
      "step": 38
    },
    {
      "epoch": 0.3847102342786683,
      "grad_norm": 0.09094128012657166,
      "learning_rate": 3.3753161229784766e-05,
      "loss": 0.8109,
      "num_input_tokens_seen": 81788928,
      "step": 39
    },
    {
      "epoch": 0.39457459926017263,
      "grad_norm": 0.09645260125398636,
      "learning_rate": 3.302064424538419e-05,
      "loss": 0.827,
      "num_input_tokens_seen": 83886080,
      "step": 40
    },
    {
      "epoch": 0.40443896424167697,
      "grad_norm": 0.08931674808263779,
      "learning_rate": 3.228036780568131e-05,
      "loss": 0.8167,
      "num_input_tokens_seen": 85983232,
      "step": 41
    },
    {
      "epoch": 0.41430332922318125,
      "grad_norm": 0.08245021849870682,
      "learning_rate": 3.153304808032152e-05,
      "loss": 0.8073,
      "num_input_tokens_seen": 88080384,
      "step": 42
    },
    {
      "epoch": 0.4241676942046856,
      "grad_norm": 0.08803369104862213,
      "learning_rate": 3.077940805287425e-05,
      "loss": 0.7793,
      "num_input_tokens_seen": 90177536,
      "step": 43
    },
    {
      "epoch": 0.43403205918618987,
      "grad_norm": 0.08340983092784882,
      "learning_rate": 3.0020176821392964e-05,
      "loss": 0.8096,
      "num_input_tokens_seen": 92274688,
      "step": 44
    },
    {
      "epoch": 0.4438964241676942,
      "grad_norm": 0.09057221561670303,
      "learning_rate": 2.925608889305997e-05,
      "loss": 0.8212,
      "num_input_tokens_seen": 94371840,
      "step": 45
    },
    {
      "epoch": 0.45376078914919854,
      "grad_norm": 0.08546467870473862,
      "learning_rate": 2.848788347359808e-05,
      "loss": 0.8151,
      "num_input_tokens_seen": 96468992,
      "step": 46
    },
    {
      "epoch": 0.4636251541307028,
      "grad_norm": 0.08286646008491516,
      "learning_rate": 2.7716303752136864e-05,
      "loss": 0.8331,
      "num_input_tokens_seen": 98566144,
      "step": 47
    },
    {
      "epoch": 0.47348951911220716,
      "grad_norm": 0.08105745166540146,
      "learning_rate": 2.6942096182225162e-05,
      "loss": 0.8003,
      "num_input_tokens_seen": 100663296,
      "step": 48
    },
    {
      "epoch": 0.4833538840937115,
      "grad_norm": 0.07993409782648087,
      "learning_rate": 2.616600975968544e-05,
      "loss": 0.8214,
      "num_input_tokens_seen": 102760448,
      "step": 49
    },
    {
      "epoch": 0.4932182490752158,
      "grad_norm": 0.08382619917392731,
      "learning_rate": 2.5388795298008776e-05,
      "loss": 0.8118,
      "num_input_tokens_seen": 104857600,
      "step": 50
    },
    {
      "epoch": 0.5030826140567201,
      "grad_norm": 0.08271751552820206,
      "learning_rate": 2.4611204701991227e-05,
      "loss": 0.8382,
      "num_input_tokens_seen": 106954752,
      "step": 51
    },
    {
      "epoch": 0.5129469790382244,
      "grad_norm": 0.07375568896532059,
      "learning_rate": 2.3833990240314562e-05,
      "loss": 0.8099,
      "num_input_tokens_seen": 109051904,
      "step": 52
    },
    {
      "epoch": 0.5228113440197287,
      "grad_norm": 0.07492537051439285,
      "learning_rate": 2.3057903817774843e-05,
      "loss": 0.7914,
      "num_input_tokens_seen": 111149056,
      "step": 53
    },
    {
      "epoch": 0.5326757090012331,
      "grad_norm": 0.07812082767486572,
      "learning_rate": 2.2283696247863135e-05,
      "loss": 0.8104,
      "num_input_tokens_seen": 113246208,
      "step": 54
    },
    {
      "epoch": 0.5425400739827374,
      "grad_norm": 0.07944825291633606,
      "learning_rate": 2.1512116526401928e-05,
      "loss": 0.8125,
      "num_input_tokens_seen": 115343360,
      "step": 55
    },
    {
      "epoch": 0.5524044389642416,
      "grad_norm": 0.07352437824010849,
      "learning_rate": 2.0743911106940034e-05,
      "loss": 0.8198,
      "num_input_tokens_seen": 117440512,
      "step": 56
    },
    {
      "epoch": 0.562268803945746,
      "grad_norm": 0.07700145244598389,
      "learning_rate": 1.9979823178607042e-05,
      "loss": 0.8019,
      "num_input_tokens_seen": 119537664,
      "step": 57
    },
    {
      "epoch": 0.5721331689272503,
      "grad_norm": 0.07811249792575836,
      "learning_rate": 1.9220591947125766e-05,
      "loss": 0.8141,
      "num_input_tokens_seen": 121634816,
      "step": 58
    },
    {
      "epoch": 0.5819975339087546,
      "grad_norm": 0.07361660897731781,
      "learning_rate": 1.846695191967849e-05,
      "loss": 0.7985,
      "num_input_tokens_seen": 123731968,
      "step": 59
    },
    {
      "epoch": 0.591861898890259,
      "grad_norm": 0.07933210581541061,
      "learning_rate": 1.7719632194318702e-05,
      "loss": 0.7998,
      "num_input_tokens_seen": 125829120,
      "step": 60
    },
    {
      "epoch": 0.6017262638717632,
      "grad_norm": 0.0801980197429657,
      "learning_rate": 1.6979355754615814e-05,
      "loss": 0.7988,
      "num_input_tokens_seen": 127926272,
      "step": 61
    },
    {
      "epoch": 0.6115906288532675,
      "grad_norm": 0.07316846400499344,
      "learning_rate": 1.6246838770215233e-05,
      "loss": 0.8016,
      "num_input_tokens_seen": 130023424,
      "step": 62
    },
    {
      "epoch": 0.6214549938347719,
      "grad_norm": 0.0750172808766365,
      "learning_rate": 1.552278990399052e-05,
      "loss": 0.8162,
      "num_input_tokens_seen": 132120576,
      "step": 63
    },
    {
      "epoch": 0.6313193588162762,
      "grad_norm": 0.07336606830358505,
      "learning_rate": 1.4807909626457782e-05,
      "loss": 0.8258,
      "num_input_tokens_seen": 134217728,
      "step": 64
    },
    {
      "epoch": 0.6411837237977805,
      "grad_norm": 0.07929656654596329,
      "learning_rate": 1.4102889538115723e-05,
      "loss": 0.8063,
      "num_input_tokens_seen": 136314880,
      "step": 65
    },
    {
      "epoch": 0.6510480887792849,
      "grad_norm": 0.08207987248897552,
      "learning_rate": 1.3408411700366813e-05,
      "loss": 0.8116,
      "num_input_tokens_seen": 138412032,
      "step": 66
    },
    {
      "epoch": 0.6609124537607891,
      "grad_norm": 0.07338231056928635,
      "learning_rate": 1.2725147975666948e-05,
      "loss": 0.785,
      "num_input_tokens_seen": 140509184,
      "step": 67
    },
    {
      "epoch": 0.6707768187422934,
      "grad_norm": 0.08025672286748886,
      "learning_rate": 1.2053759377541866e-05,
      "loss": 0.8049,
      "num_input_tokens_seen": 142606336,
      "step": 68
    },
    {
      "epoch": 0.6806411837237978,
      "grad_norm": 0.07760636508464813,
      "learning_rate": 1.1394895431099314e-05,
      "loss": 0.8034,
      "num_input_tokens_seen": 144703488,
      "step": 69
    },
    {
      "epoch": 0.6905055487053021,
      "grad_norm": 0.07995034009218216,
      "learning_rate": 1.0749193544655434e-05,
      "loss": 0.7949,
      "num_input_tokens_seen": 146800640,
      "step": 70
    },
    {
      "epoch": 0.7003699136868065,
      "grad_norm": 0.07639381289482117,
      "learning_rate": 1.0117278393083358e-05,
      "loss": 0.8024,
      "num_input_tokens_seen": 148897792,
      "step": 71
    },
    {
      "epoch": 0.7102342786683107,
      "grad_norm": 0.0737844705581665,
      "learning_rate": 9.499761313480626e-06,
      "loss": 0.7608,
      "num_input_tokens_seen": 150994944,
      "step": 72
    },
    {
      "epoch": 0.720098643649815,
      "grad_norm": 0.07084383070468903,
      "learning_rate": 8.897239713740058e-06,
      "loss": 0.8012,
      "num_input_tokens_seen": 153092096,
      "step": 73
    },
    {
      "epoch": 0.7299630086313194,
      "grad_norm": 0.07751645892858505,
      "learning_rate": 8.31029649459616e-06,
      "loss": 0.7688,
      "num_input_tokens_seen": 155189248,
      "step": 74
    },
    {
      "epoch": 0.7398273736128237,
      "grad_norm": 0.07183132320642471,
      "learning_rate": 7.739499485706334e-06,
      "loss": 0.8023,
      "num_input_tokens_seen": 157286400,
      "step": 75
    },
    {
      "epoch": 0.749691738594328,
      "grad_norm": 0.07271506637334824,
      "learning_rate": 7.185400896312328e-06,
      "loss": 0.7809,
      "num_input_tokens_seen": 159383552,
      "step": 76
    },
    {
      "epoch": 0.7595561035758323,
      "grad_norm": 0.07422608137130737,
      "learning_rate": 6.648536781013495e-06,
      "loss": 0.8083,
      "num_input_tokens_seen": 161480704,
      "step": 77
    },
    {
      "epoch": 0.7694204685573366,
      "grad_norm": 0.07867728918790817,
      "learning_rate": 6.12942652116858e-06,
      "loss": 0.7903,
      "num_input_tokens_seen": 163577856,
      "step": 78
    },
    {
      "epoch": 0.7792848335388409,
      "grad_norm": 0.07830856740474701,
      "learning_rate": 5.628572322427755e-06,
      "loss": 0.7904,
      "num_input_tokens_seen": 165675008,
      "step": 79
    },
    {
      "epoch": 0.7891491985203453,
      "grad_norm": 0.07836252450942993,
      "learning_rate": 5.1464587288811624e-06,
      "loss": 0.797,
      "num_input_tokens_seen": 167772160,
      "step": 80
    },
    {
      "epoch": 0.7990135635018496,
      "grad_norm": 0.07151419669389725,
      "learning_rate": 4.683552154293747e-06,
      "loss": 0.7636,
      "num_input_tokens_seen": 169869312,
      "step": 81
    },
    {
      "epoch": 0.8088779284833539,
      "grad_norm": 0.07672585546970367,
      "learning_rate": 4.240300430880062e-06,
      "loss": 0.7818,
      "num_input_tokens_seen": 171966464,
      "step": 82
    },
    {
      "epoch": 0.8187422934648582,
      "grad_norm": 0.08120947331190109,
      "learning_rate": 3.817132376055565e-06,
      "loss": 0.7914,
      "num_input_tokens_seen": 174063616,
      "step": 83
    },
    {
      "epoch": 0.8286066584463625,
      "grad_norm": 0.07581306993961334,
      "learning_rate": 3.4144573775834134e-06,
      "loss": 0.7985,
      "num_input_tokens_seen": 176160768,
      "step": 84
    },
    {
      "epoch": 0.8384710234278668,
      "grad_norm": 0.08039968460798264,
      "learning_rate": 3.0326649975182865e-06,
      "loss": 0.7868,
      "num_input_tokens_seen": 178257920,
      "step": 85
    },
    {
      "epoch": 0.8483353884093712,
      "grad_norm": 0.07282786071300507,
      "learning_rate": 2.672124595330214e-06,
      "loss": 0.7972,
      "num_input_tokens_seen": 180355072,
      "step": 86
    },
    {
      "epoch": 0.8581997533908755,
      "grad_norm": 0.07389923185110092,
      "learning_rate": 2.333184970573188e-06,
      "loss": 0.7933,
      "num_input_tokens_seen": 182452224,
      "step": 87
    },
    {
      "epoch": 0.8680641183723797,
      "grad_norm": 0.07451856881380081,
      "learning_rate": 2.0161740254440896e-06,
      "loss": 0.7695,
      "num_input_tokens_seen": 184549376,
      "step": 88
    },
    {
      "epoch": 0.8779284833538841,
      "grad_norm": 0.0740065649151802,
      "learning_rate": 1.7213984475585144e-06,
      "loss": 0.7929,
      "num_input_tokens_seen": 186646528,
      "step": 89
    },
    {
      "epoch": 0.8877928483353884,
      "grad_norm": 0.07559362053871155,
      "learning_rate": 1.4491434132502729e-06,
      "loss": 0.7995,
      "num_input_tokens_seen": 188743680,
      "step": 90
    },
    {
      "epoch": 0.8976572133168927,
      "grad_norm": 0.07598087936639786,
      "learning_rate": 1.1996723116817516e-06,
      "loss": 0.7857,
      "num_input_tokens_seen": 190840832,
      "step": 91
    },
    {
      "epoch": 0.9075215782983971,
      "grad_norm": 0.0752101019024849,
      "learning_rate": 9.732264900318866e-07,
      "loss": 0.8215,
      "num_input_tokens_seen": 192937984,
      "step": 92
    },
    {
      "epoch": 0.9173859432799013,
      "grad_norm": 0.07947240769863129,
      "learning_rate": 7.700250200083469e-07,
      "loss": 0.7931,
      "num_input_tokens_seen": 195035136,
      "step": 93
    },
    {
      "epoch": 0.9272503082614056,
      "grad_norm": 0.07513375580310822,
      "learning_rate": 5.90264485909825e-07,
      "loss": 0.7927,
      "num_input_tokens_seen": 197132288,
      "step": 94
    },
    {
      "epoch": 0.93711467324291,
      "grad_norm": 0.0716259703040123,
      "learning_rate": 4.341187944434083e-07,
      "loss": 0.7787,
      "num_input_tokens_seen": 199229440,
      "step": 95
    },
    {
      "epoch": 0.9469790382244143,
      "grad_norm": 0.07607847452163696,
      "learning_rate": 3.017390064810832e-07,
      "loss": 0.7771,
      "num_input_tokens_seen": 201326592,
      "step": 96
    },
    {
      "epoch": 0.9568434032059187,
      "grad_norm": 0.07854241132736206,
      "learning_rate": 1.9325319091808847e-07,
      "loss": 0.7931,
      "num_input_tokens_seen": 203423744,
      "step": 97
    },
    {
      "epoch": 0.966707768187423,
      "grad_norm": 0.07781317085027695,
      "learning_rate": 1.0876630077453487e-07,
      "loss": 0.7902,
      "num_input_tokens_seen": 205520896,
      "step": 98
    },
    {
      "epoch": 0.9765721331689272,
      "grad_norm": 0.07628636062145233,
      "learning_rate": 4.836007166014178e-08,
      "loss": 0.7906,
      "num_input_tokens_seen": 207618048,
      "step": 99
    },
    {
      "epoch": 0.9864364981504316,
      "grad_norm": 0.07593634724617004,
      "learning_rate": 1.2092942700298038e-08,
      "loss": 0.8083,
      "num_input_tokens_seen": 209715200,
      "step": 100
    },
    {
      "epoch": 0.9963008631319359,
      "grad_norm": 0.074070043861866,
      "learning_rate": 0.0,
      "loss": 0.7963,
      "num_input_tokens_seen": 211812352,
      "step": 101
    },
    {
      "epoch": 0.9963008631319359,
      "num_input_tokens_seen": 211812352,
      "step": 101,
      "total_flos": 8.985866598858359e+18,
      "train_loss": 0.8237513356869763,
      "train_runtime": 15586.605,
      "train_samples_per_second": 3.328,
      "train_steps_per_second": 0.006
    }
  ],
  "logging_steps": 1,
  "max_steps": 101,
  "num_input_tokens_seen": 211812352,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 8.985866598858359e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}
|
|