|
{ |
|
"best_metric": 1.3390412330627441, |
|
"best_model_checkpoint": "4bit_repro_03022025/host6_seed_42_full_det_fp16_no_flash_attn_fix_pad_llama-3.1-instruct-l16-cot-4ep-lr3e04-ws20-bs4-ga4-fp16-04022025/checkpoint-109", |
|
"epoch": 3.9655172413793105, |
|
"eval_steps": 500, |
|
"global_step": 432, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.009195402298850575, |
|
"grad_norm": 1.2009892463684082, |
|
"learning_rate": 1.4999999999999999e-05, |
|
"loss": 2.4718, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.01839080459770115, |
|
"grad_norm": 0.8724077939987183, |
|
"learning_rate": 2.9999999999999997e-05, |
|
"loss": 2.479, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.027586206896551724, |
|
"grad_norm": 1.004928708076477, |
|
"learning_rate": 4.4999999999999996e-05, |
|
"loss": 2.3757, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0367816091954023, |
|
"grad_norm": 0.7040066123008728, |
|
"learning_rate": 5.9999999999999995e-05, |
|
"loss": 2.2499, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.04597701149425287, |
|
"grad_norm": 1.9864150285720825, |
|
"learning_rate": 7.5e-05, |
|
"loss": 2.3655, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.05517241379310345, |
|
"grad_norm": 0.8593456149101257, |
|
"learning_rate": 8.999999999999999e-05, |
|
"loss": 2.2029, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.06436781609195402, |
|
"grad_norm": 0.5284996628761292, |
|
"learning_rate": 0.00010499999999999999, |
|
"loss": 2.302, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0735632183908046, |
|
"grad_norm": 0.49511250853538513, |
|
"learning_rate": 0.00011999999999999999, |
|
"loss": 2.2581, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.08275862068965517, |
|
"grad_norm": 0.6772745847702026, |
|
"learning_rate": 0.000135, |
|
"loss": 2.1002, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.09195402298850575, |
|
"grad_norm": 0.7531024217605591, |
|
"learning_rate": 0.00015, |
|
"loss": 1.8504, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.10114942528735632, |
|
"grad_norm": 0.4855881631374359, |
|
"learning_rate": 0.000165, |
|
"loss": 1.8903, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.1103448275862069, |
|
"grad_norm": 0.6145009994506836, |
|
"learning_rate": 0.00017999999999999998, |
|
"loss": 1.868, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.11954022988505747, |
|
"grad_norm": 0.8082670569419861, |
|
"learning_rate": 0.000195, |
|
"loss": 1.554, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.12873563218390804, |
|
"grad_norm": 0.516228973865509, |
|
"learning_rate": 0.00020999999999999998, |
|
"loss": 1.7451, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.13793103448275862, |
|
"grad_norm": 0.4332088232040405, |
|
"learning_rate": 0.000225, |
|
"loss": 1.7337, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.1471264367816092, |
|
"grad_norm": 0.497397243976593, |
|
"learning_rate": 0.00023999999999999998, |
|
"loss": 1.949, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.15632183908045977, |
|
"grad_norm": 0.5272815227508545, |
|
"learning_rate": 0.00025499999999999996, |
|
"loss": 1.4676, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.16551724137931034, |
|
"grad_norm": 0.7768718600273132, |
|
"learning_rate": 0.00027, |
|
"loss": 1.4129, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.17471264367816092, |
|
"grad_norm": 0.43140313029289246, |
|
"learning_rate": 0.000285, |
|
"loss": 1.3876, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.1839080459770115, |
|
"grad_norm": 0.4079054892063141, |
|
"learning_rate": 0.0003, |
|
"loss": 1.6527, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.19310344827586207, |
|
"grad_norm": 0.39203184843063354, |
|
"learning_rate": 0.00029927184466019415, |
|
"loss": 1.7775, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.20229885057471264, |
|
"grad_norm": 0.35493654012680054, |
|
"learning_rate": 0.00029854368932038833, |
|
"loss": 1.6854, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.21149425287356322, |
|
"grad_norm": 0.4060712456703186, |
|
"learning_rate": 0.0002978155339805825, |
|
"loss": 1.4592, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.2206896551724138, |
|
"grad_norm": 0.3721208870410919, |
|
"learning_rate": 0.0002970873786407767, |
|
"loss": 1.571, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.22988505747126436, |
|
"grad_norm": 0.31752142310142517, |
|
"learning_rate": 0.00029635922330097087, |
|
"loss": 1.7418, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.23908045977011494, |
|
"grad_norm": 0.36056575179100037, |
|
"learning_rate": 0.00029563106796116505, |
|
"loss": 1.5262, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.2482758620689655, |
|
"grad_norm": 0.4136730432510376, |
|
"learning_rate": 0.0002949029126213592, |
|
"loss": 1.4683, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.2574712643678161, |
|
"grad_norm": 0.382100909948349, |
|
"learning_rate": 0.00029417475728155335, |
|
"loss": 1.4068, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 0.38961261510849, |
|
"learning_rate": 0.00029344660194174753, |
|
"loss": 1.7098, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.27586206896551724, |
|
"grad_norm": 0.3330027461051941, |
|
"learning_rate": 0.0002927184466019417, |
|
"loss": 1.8888, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2850574712643678, |
|
"grad_norm": 0.37608665227890015, |
|
"learning_rate": 0.0002919902912621359, |
|
"loss": 1.6444, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.2942528735632184, |
|
"grad_norm": 0.3394232988357544, |
|
"learning_rate": 0.00029126213592233006, |
|
"loss": 1.4291, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.30344827586206896, |
|
"grad_norm": 0.5324225425720215, |
|
"learning_rate": 0.00029053398058252424, |
|
"loss": 1.29, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.31264367816091954, |
|
"grad_norm": 0.45669490098953247, |
|
"learning_rate": 0.0002898058252427184, |
|
"loss": 1.6305, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.3218390804597701, |
|
"grad_norm": 0.4215954840183258, |
|
"learning_rate": 0.0002890776699029126, |
|
"loss": 1.6635, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.3310344827586207, |
|
"grad_norm": 0.44693419337272644, |
|
"learning_rate": 0.0002883495145631068, |
|
"loss": 1.4743, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.34022988505747126, |
|
"grad_norm": 0.35419154167175293, |
|
"learning_rate": 0.00028762135922330096, |
|
"loss": 1.6226, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.34942528735632183, |
|
"grad_norm": 0.32436278462409973, |
|
"learning_rate": 0.00028689320388349513, |
|
"loss": 1.7563, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.3586206896551724, |
|
"grad_norm": 0.44368860125541687, |
|
"learning_rate": 0.0002861650485436893, |
|
"loss": 1.3193, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.367816091954023, |
|
"grad_norm": 0.3981421887874603, |
|
"learning_rate": 0.0002854368932038835, |
|
"loss": 1.739, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.37701149425287356, |
|
"grad_norm": 0.32864871621131897, |
|
"learning_rate": 0.00028470873786407767, |
|
"loss": 1.5729, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.38620689655172413, |
|
"grad_norm": 0.5265761017799377, |
|
"learning_rate": 0.00028398058252427185, |
|
"loss": 1.1951, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.3954022988505747, |
|
"grad_norm": 0.34742191433906555, |
|
"learning_rate": 0.00028325242718446603, |
|
"loss": 1.4895, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.4045977011494253, |
|
"grad_norm": 0.3741191327571869, |
|
"learning_rate": 0.00028252427184466015, |
|
"loss": 1.4051, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.41379310344827586, |
|
"grad_norm": 0.4229290187358856, |
|
"learning_rate": 0.00028179611650485433, |
|
"loss": 1.2828, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.42298850574712643, |
|
"grad_norm": 0.32790377736091614, |
|
"learning_rate": 0.0002810679611650485, |
|
"loss": 1.6367, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.432183908045977, |
|
"grad_norm": 0.41407954692840576, |
|
"learning_rate": 0.0002803398058252427, |
|
"loss": 1.4384, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.4413793103448276, |
|
"grad_norm": 0.39225372672080994, |
|
"learning_rate": 0.00027961165048543687, |
|
"loss": 1.388, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.45057471264367815, |
|
"grad_norm": 0.4155985116958618, |
|
"learning_rate": 0.00027888349514563105, |
|
"loss": 1.3559, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.45977011494252873, |
|
"grad_norm": 0.4461081027984619, |
|
"learning_rate": 0.0002781553398058252, |
|
"loss": 1.1852, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.4689655172413793, |
|
"grad_norm": 0.3528655767440796, |
|
"learning_rate": 0.0002774271844660194, |
|
"loss": 1.544, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.4781609195402299, |
|
"grad_norm": 0.3481675088405609, |
|
"learning_rate": 0.0002766990291262136, |
|
"loss": 1.3634, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.48735632183908045, |
|
"grad_norm": 0.42916133999824524, |
|
"learning_rate": 0.00027597087378640776, |
|
"loss": 1.2693, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.496551724137931, |
|
"grad_norm": 0.44757965207099915, |
|
"learning_rate": 0.00027524271844660194, |
|
"loss": 1.1648, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.5057471264367817, |
|
"grad_norm": 0.3837111294269562, |
|
"learning_rate": 0.0002745145631067961, |
|
"loss": 1.5693, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.5149425287356322, |
|
"grad_norm": 0.4521600604057312, |
|
"learning_rate": 0.0002737864077669903, |
|
"loss": 1.4492, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.5241379310344828, |
|
"grad_norm": 0.4231400191783905, |
|
"learning_rate": 0.0002730582524271845, |
|
"loss": 1.2684, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 0.38788753747940063, |
|
"learning_rate": 0.00027233009708737865, |
|
"loss": 1.4203, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.542528735632184, |
|
"grad_norm": 0.46515601873397827, |
|
"learning_rate": 0.0002716019417475728, |
|
"loss": 1.3853, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.5517241379310345, |
|
"grad_norm": 0.3752426207065582, |
|
"learning_rate": 0.00027087378640776696, |
|
"loss": 1.4567, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.5609195402298851, |
|
"grad_norm": 0.41895580291748047, |
|
"learning_rate": 0.00027014563106796114, |
|
"loss": 1.3408, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.5701149425287356, |
|
"grad_norm": 0.38586685061454773, |
|
"learning_rate": 0.0002694174757281553, |
|
"loss": 1.2865, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.5793103448275863, |
|
"grad_norm": 0.46207135915756226, |
|
"learning_rate": 0.0002686893203883495, |
|
"loss": 1.2481, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.5885057471264368, |
|
"grad_norm": 0.5671063661575317, |
|
"learning_rate": 0.00026796116504854367, |
|
"loss": 1.1445, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.5977011494252874, |
|
"grad_norm": 0.7277198433876038, |
|
"learning_rate": 0.00026723300970873785, |
|
"loss": 1.4223, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.6068965517241379, |
|
"grad_norm": 0.45053866505622864, |
|
"learning_rate": 0.00026650485436893203, |
|
"loss": 0.9746, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.6160919540229886, |
|
"grad_norm": 0.4665685296058655, |
|
"learning_rate": 0.0002657766990291262, |
|
"loss": 1.3799, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.6252873563218391, |
|
"grad_norm": 0.4472177028656006, |
|
"learning_rate": 0.0002650485436893204, |
|
"loss": 1.3808, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.6344827586206897, |
|
"grad_norm": 0.624518871307373, |
|
"learning_rate": 0.00026432038834951456, |
|
"loss": 1.3923, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.6436781609195402, |
|
"grad_norm": 0.4509923458099365, |
|
"learning_rate": 0.00026359223300970874, |
|
"loss": 1.2953, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.6528735632183909, |
|
"grad_norm": 0.4893573224544525, |
|
"learning_rate": 0.0002628640776699029, |
|
"loss": 1.2323, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.6620689655172414, |
|
"grad_norm": 0.4732131361961365, |
|
"learning_rate": 0.00026213592233009705, |
|
"loss": 1.1789, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.671264367816092, |
|
"grad_norm": 0.5011962056159973, |
|
"learning_rate": 0.0002614077669902912, |
|
"loss": 1.2789, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.6804597701149425, |
|
"grad_norm": 0.534753680229187, |
|
"learning_rate": 0.0002606796116504854, |
|
"loss": 1.4238, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"grad_norm": 0.5315132141113281, |
|
"learning_rate": 0.0002599514563106796, |
|
"loss": 1.2394, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.6988505747126437, |
|
"grad_norm": 0.6065260171890259, |
|
"learning_rate": 0.00025922330097087376, |
|
"loss": 1.4696, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.7080459770114943, |
|
"grad_norm": 0.5865620374679565, |
|
"learning_rate": 0.00025849514563106794, |
|
"loss": 1.2078, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.7172413793103448, |
|
"grad_norm": 0.4729297161102295, |
|
"learning_rate": 0.0002577669902912621, |
|
"loss": 1.1301, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.7264367816091954, |
|
"grad_norm": 0.4988013505935669, |
|
"learning_rate": 0.0002570388349514563, |
|
"loss": 1.5613, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.735632183908046, |
|
"grad_norm": 0.4902057647705078, |
|
"learning_rate": 0.0002563106796116505, |
|
"loss": 1.3873, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.7448275862068966, |
|
"grad_norm": 0.4350455403327942, |
|
"learning_rate": 0.0002555825242718446, |
|
"loss": 1.2618, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.7540229885057471, |
|
"grad_norm": 0.40848422050476074, |
|
"learning_rate": 0.0002548543689320388, |
|
"loss": 1.2219, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.7632183908045977, |
|
"grad_norm": 0.5879409313201904, |
|
"learning_rate": 0.00025412621359223296, |
|
"loss": 1.3057, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.7724137931034483, |
|
"grad_norm": 0.4928654730319977, |
|
"learning_rate": 0.00025339805825242714, |
|
"loss": 1.3198, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.7816091954022989, |
|
"grad_norm": 0.5202780961990356, |
|
"learning_rate": 0.0002526699029126213, |
|
"loss": 1.1353, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.7908045977011494, |
|
"grad_norm": 0.580213725566864, |
|
"learning_rate": 0.0002519417475728155, |
|
"loss": 1.1763, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.6902905702590942, |
|
"learning_rate": 0.00025121359223300967, |
|
"loss": 1.3229, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.8091954022988506, |
|
"grad_norm": 0.447915256023407, |
|
"learning_rate": 0.00025048543689320385, |
|
"loss": 1.2624, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.8183908045977012, |
|
"grad_norm": 0.4989657402038574, |
|
"learning_rate": 0.00024975728155339803, |
|
"loss": 1.1308, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.8275862068965517, |
|
"grad_norm": 0.5150200724601746, |
|
"learning_rate": 0.0002490291262135922, |
|
"loss": 0.9972, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.8367816091954023, |
|
"grad_norm": 0.509914219379425, |
|
"learning_rate": 0.0002483009708737864, |
|
"loss": 1.2623, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.8459770114942529, |
|
"grad_norm": 0.5083137154579163, |
|
"learning_rate": 0.00024757281553398056, |
|
"loss": 1.099, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.8551724137931035, |
|
"grad_norm": 0.5252517461776733, |
|
"learning_rate": 0.00024684466019417474, |
|
"loss": 1.0284, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.864367816091954, |
|
"grad_norm": 0.643261730670929, |
|
"learning_rate": 0.0002461165048543689, |
|
"loss": 1.1678, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.8735632183908046, |
|
"grad_norm": 0.6660665273666382, |
|
"learning_rate": 0.0002453883495145631, |
|
"loss": 1.2439, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.8827586206896552, |
|
"grad_norm": 0.45542895793914795, |
|
"learning_rate": 0.0002446601941747572, |
|
"loss": 1.0297, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.8919540229885058, |
|
"grad_norm": 0.505804181098938, |
|
"learning_rate": 0.00024393203883495143, |
|
"loss": 1.1916, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.9011494252873563, |
|
"grad_norm": 0.4834766685962677, |
|
"learning_rate": 0.0002432038834951456, |
|
"loss": 1.246, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.9103448275862069, |
|
"grad_norm": 0.586925745010376, |
|
"learning_rate": 0.0002424757281553398, |
|
"loss": 1.1541, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.9195402298850575, |
|
"grad_norm": 0.4611804187297821, |
|
"learning_rate": 0.00024174757281553394, |
|
"loss": 1.1095, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.9287356321839081, |
|
"grad_norm": 0.5377518534660339, |
|
"learning_rate": 0.00024101941747572812, |
|
"loss": 1.0846, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.9379310344827586, |
|
"grad_norm": 0.797167181968689, |
|
"learning_rate": 0.0002402912621359223, |
|
"loss": 1.0701, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.9471264367816092, |
|
"grad_norm": 0.5871890187263489, |
|
"learning_rate": 0.00023956310679611648, |
|
"loss": 1.034, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.9563218390804598, |
|
"grad_norm": 0.5478411316871643, |
|
"learning_rate": 0.00023883495145631065, |
|
"loss": 0.9545, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.9655172413793104, |
|
"grad_norm": 0.5382391214370728, |
|
"learning_rate": 0.00023810679611650483, |
|
"loss": 1.0392, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.9747126436781609, |
|
"grad_norm": 0.5287902355194092, |
|
"learning_rate": 0.000237378640776699, |
|
"loss": 1.2706, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.9839080459770115, |
|
"grad_norm": 0.45206359028816223, |
|
"learning_rate": 0.0002366504854368932, |
|
"loss": 1.1491, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.993103448275862, |
|
"grad_norm": 0.5216997265815735, |
|
"learning_rate": 0.00023592233009708734, |
|
"loss": 0.9557, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5584304332733154, |
|
"learning_rate": 0.00023519417475728152, |
|
"loss": 1.1317, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.3390412330627441, |
|
"eval_runtime": 71.2164, |
|
"eval_samples_per_second": 4.662, |
|
"eval_steps_per_second": 2.331, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.0091954022988505, |
|
"grad_norm": 0.5057152509689331, |
|
"learning_rate": 0.0002344660194174757, |
|
"loss": 0.9771, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.018390804597701, |
|
"grad_norm": 0.4914540648460388, |
|
"learning_rate": 0.00023373786407766988, |
|
"loss": 0.9691, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.0275862068965518, |
|
"grad_norm": 0.5995070934295654, |
|
"learning_rate": 0.00023300970873786406, |
|
"loss": 0.8675, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.0367816091954023, |
|
"grad_norm": 0.841068685054779, |
|
"learning_rate": 0.00023228155339805823, |
|
"loss": 1.1067, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.0459770114942528, |
|
"grad_norm": 0.566326916217804, |
|
"learning_rate": 0.0002315533980582524, |
|
"loss": 0.9868, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.0551724137931036, |
|
"grad_norm": 0.6521961688995361, |
|
"learning_rate": 0.0002308252427184466, |
|
"loss": 0.9899, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.064367816091954, |
|
"grad_norm": 0.4797298312187195, |
|
"learning_rate": 0.00023009708737864074, |
|
"loss": 0.9234, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.0735632183908046, |
|
"grad_norm": 0.5537835955619812, |
|
"learning_rate": 0.00022936893203883492, |
|
"loss": 1.0399, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.0827586206896551, |
|
"grad_norm": 0.5003221035003662, |
|
"learning_rate": 0.0002286407766990291, |
|
"loss": 1.0128, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.0919540229885056, |
|
"grad_norm": 0.4969468414783478, |
|
"learning_rate": 0.00022791262135922328, |
|
"loss": 0.9351, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.1011494252873564, |
|
"grad_norm": 0.4632287919521332, |
|
"learning_rate": 0.00022718446601941746, |
|
"loss": 0.9637, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.110344827586207, |
|
"grad_norm": 0.6883680820465088, |
|
"learning_rate": 0.00022645631067961164, |
|
"loss": 0.9918, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.1195402298850574, |
|
"grad_norm": 0.5814421772956848, |
|
"learning_rate": 0.00022572815533980582, |
|
"loss": 0.8679, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.1287356321839082, |
|
"grad_norm": 0.62347012758255, |
|
"learning_rate": 0.000225, |
|
"loss": 0.954, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.1379310344827587, |
|
"grad_norm": 0.6778735518455505, |
|
"learning_rate": 0.00022427184466019415, |
|
"loss": 0.9919, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.1471264367816092, |
|
"grad_norm": 0.6114857792854309, |
|
"learning_rate": 0.00022354368932038832, |
|
"loss": 0.8959, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.1563218390804597, |
|
"grad_norm": 0.6160824298858643, |
|
"learning_rate": 0.0002228155339805825, |
|
"loss": 0.8242, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.1655172413793102, |
|
"grad_norm": 0.7020765542984009, |
|
"learning_rate": 0.00022208737864077668, |
|
"loss": 0.8819, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.174712643678161, |
|
"grad_norm": 0.5306417346000671, |
|
"learning_rate": 0.00022135922330097086, |
|
"loss": 0.9443, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.1839080459770115, |
|
"grad_norm": 0.5717440843582153, |
|
"learning_rate": 0.00022063106796116504, |
|
"loss": 0.9057, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.193103448275862, |
|
"grad_norm": 0.7055572271347046, |
|
"learning_rate": 0.00021990291262135922, |
|
"loss": 0.8812, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.2022988505747128, |
|
"grad_norm": 0.5051671266555786, |
|
"learning_rate": 0.00021917475728155337, |
|
"loss": 0.7836, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.2114942528735633, |
|
"grad_norm": 0.5158051252365112, |
|
"learning_rate": 0.00021844660194174755, |
|
"loss": 0.8701, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.2206896551724138, |
|
"grad_norm": 0.5222744941711426, |
|
"learning_rate": 0.00021771844660194173, |
|
"loss": 0.8326, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.2298850574712643, |
|
"grad_norm": 1.2943826913833618, |
|
"learning_rate": 0.0002169902912621359, |
|
"loss": 0.8821, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.2390804597701148, |
|
"grad_norm": 0.6272923350334167, |
|
"learning_rate": 0.00021626213592233008, |
|
"loss": 0.9379, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.2482758620689656, |
|
"grad_norm": 0.5198677778244019, |
|
"learning_rate": 0.00021553398058252426, |
|
"loss": 1.0225, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.257471264367816, |
|
"grad_norm": 0.5434890985488892, |
|
"learning_rate": 0.00021480582524271844, |
|
"loss": 0.8091, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.2666666666666666, |
|
"grad_norm": 0.5759937167167664, |
|
"learning_rate": 0.00021407766990291262, |
|
"loss": 0.9161, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.2758620689655173, |
|
"grad_norm": 0.6376967430114746, |
|
"learning_rate": 0.00021334951456310677, |
|
"loss": 1.0129, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.2850574712643679, |
|
"grad_norm": 0.5497775673866272, |
|
"learning_rate": 0.00021262135922330095, |
|
"loss": 0.8934, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.2942528735632184, |
|
"grad_norm": 0.5189297199249268, |
|
"learning_rate": 0.00021189320388349513, |
|
"loss": 1.0166, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.303448275862069, |
|
"grad_norm": 0.5342044234275818, |
|
"learning_rate": 0.0002111650485436893, |
|
"loss": 0.7326, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.3126436781609194, |
|
"grad_norm": 0.6083195209503174, |
|
"learning_rate": 0.00021043689320388349, |
|
"loss": 0.6994, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.3218390804597702, |
|
"grad_norm": 0.6031423807144165, |
|
"learning_rate": 0.00020970873786407766, |
|
"loss": 0.8656, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.3310344827586207, |
|
"grad_norm": 0.7030931711196899, |
|
"learning_rate": 0.00020898058252427184, |
|
"loss": 0.7279, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.3402298850574712, |
|
"grad_norm": 0.6062871217727661, |
|
"learning_rate": 0.00020825242718446602, |
|
"loss": 0.7569, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.349425287356322, |
|
"grad_norm": 0.6677907109260559, |
|
"learning_rate": 0.00020752427184466017, |
|
"loss": 0.8127, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.3586206896551725, |
|
"grad_norm": 0.5375317335128784, |
|
"learning_rate": 0.00020679611650485435, |
|
"loss": 0.7453, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.367816091954023, |
|
"grad_norm": 0.6816096305847168, |
|
"learning_rate": 0.00020606796116504853, |
|
"loss": 0.6707, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.3770114942528735, |
|
"grad_norm": 0.4937184751033783, |
|
"learning_rate": 0.0002053398058252427, |
|
"loss": 0.6826, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.386206896551724, |
|
"grad_norm": 0.5203655958175659, |
|
"learning_rate": 0.0002046116504854369, |
|
"loss": 0.8743, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.3954022988505748, |
|
"grad_norm": 0.6682159900665283, |
|
"learning_rate": 0.00020388349514563107, |
|
"loss": 0.8367, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.4045977011494253, |
|
"grad_norm": 0.5017151832580566, |
|
"learning_rate": 0.00020315533980582524, |
|
"loss": 0.9242, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.4137931034482758, |
|
"grad_norm": 0.5524232387542725, |
|
"learning_rate": 0.00020242718446601942, |
|
"loss": 0.8313, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.4229885057471265, |
|
"grad_norm": 0.5339857935905457, |
|
"learning_rate": 0.00020169902912621357, |
|
"loss": 0.6427, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.432183908045977, |
|
"grad_norm": 0.5582138299942017, |
|
"learning_rate": 0.00020097087378640775, |
|
"loss": 0.8925, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.4413793103448276, |
|
"grad_norm": 0.5109941363334656, |
|
"learning_rate": 0.00020024271844660193, |
|
"loss": 0.7434, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.450574712643678, |
|
"grad_norm": 0.45144638419151306, |
|
"learning_rate": 0.0001995145631067961, |
|
"loss": 0.6127, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.4597701149425286, |
|
"grad_norm": 0.6283529996871948, |
|
"learning_rate": 0.0001987864077669903, |
|
"loss": 0.773, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.4689655172413794, |
|
"grad_norm": 0.7730019688606262, |
|
"learning_rate": 0.00019805825242718447, |
|
"loss": 0.9172, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.4781609195402299, |
|
"grad_norm": 0.6293427348136902, |
|
"learning_rate": 0.00019733009708737865, |
|
"loss": 0.9722, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.4873563218390804, |
|
"grad_norm": 0.44639852643013, |
|
"learning_rate": 0.0001966019417475728, |
|
"loss": 0.6411, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.4965517241379311, |
|
"grad_norm": 0.8958466053009033, |
|
"learning_rate": 0.00019587378640776698, |
|
"loss": 0.721, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.5057471264367817, |
|
"grad_norm": 0.5399364829063416, |
|
"learning_rate": 0.00019514563106796116, |
|
"loss": 0.643, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.5149425287356322, |
|
"grad_norm": 0.5027474761009216, |
|
"learning_rate": 0.00019441747572815533, |
|
"loss": 0.8111, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.524137931034483, |
|
"grad_norm": 0.47656288743019104, |
|
"learning_rate": 0.0001936893203883495, |
|
"loss": 0.7836, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.5333333333333332, |
|
"grad_norm": 0.45420220494270325, |
|
"learning_rate": 0.0001929611650485437, |
|
"loss": 0.7536, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.542528735632184, |
|
"grad_norm": 0.9322930574417114, |
|
"learning_rate": 0.00019223300970873787, |
|
"loss": 0.8202, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.5517241379310345, |
|
"grad_norm": 0.738074004650116, |
|
"learning_rate": 0.00019150485436893205, |
|
"loss": 0.7343, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.560919540229885, |
|
"grad_norm": 0.5796405076980591, |
|
"learning_rate": 0.0001907766990291262, |
|
"loss": 0.656, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.5701149425287357, |
|
"grad_norm": 0.48484688997268677, |
|
"learning_rate": 0.00019004854368932038, |
|
"loss": 0.9305, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.5793103448275863, |
|
"grad_norm": 0.697944700717926, |
|
"learning_rate": 0.00018932038834951456, |
|
"loss": 0.8577, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.5885057471264368, |
|
"grad_norm": 0.5762362480163574, |
|
"learning_rate": 0.00018859223300970874, |
|
"loss": 0.8449, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.5977011494252875, |
|
"grad_norm": 0.5346084833145142, |
|
"learning_rate": 0.00018786407766990291, |
|
"loss": 0.6286, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.6068965517241378, |
|
"grad_norm": 0.5398671627044678, |
|
"learning_rate": 0.00018713592233009707, |
|
"loss": 0.7469, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.6160919540229886, |
|
"grad_norm": 0.5442600250244141, |
|
"learning_rate": 0.00018640776699029122, |
|
"loss": 0.827, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.625287356321839, |
|
"grad_norm": 0.5366424918174744, |
|
"learning_rate": 0.0001856796116504854, |
|
"loss": 0.5276, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.6344827586206896, |
|
"grad_norm": 0.558635413646698, |
|
"learning_rate": 0.00018495145631067957, |
|
"loss": 0.5042, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.6436781609195403, |
|
"grad_norm": 0.5313374996185303, |
|
"learning_rate": 0.00018422330097087375, |
|
"loss": 0.8407, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.6528735632183909, |
|
"grad_norm": 0.5416716933250427, |
|
"learning_rate": 0.00018349514563106793, |
|
"loss": 0.7487, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.6620689655172414, |
|
"grad_norm": 0.5735067129135132, |
|
"learning_rate": 0.0001827669902912621, |
|
"loss": 0.7743, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.6712643678160921, |
|
"grad_norm": 0.5936351418495178, |
|
"learning_rate": 0.0001820388349514563, |
|
"loss": 0.8029, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.6804597701149424, |
|
"grad_norm": 0.6284213066101074, |
|
"learning_rate": 0.00018131067961165047, |
|
"loss": 0.6631, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.6896551724137931, |
|
"grad_norm": 0.512428343296051, |
|
"learning_rate": 0.00018058252427184462, |
|
"loss": 0.8344, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.6988505747126437, |
|
"grad_norm": 0.446748822927475, |
|
"learning_rate": 0.0001798543689320388, |
|
"loss": 0.7456, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.7080459770114942, |
|
"grad_norm": 0.5570324659347534, |
|
"learning_rate": 0.00017912621359223298, |
|
"loss": 0.6645, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.717241379310345, |
|
"grad_norm": 0.8319283127784729, |
|
"learning_rate": 0.00017839805825242716, |
|
"loss": 0.631, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.7264367816091954, |
|
"grad_norm": 0.8194088935852051, |
|
"learning_rate": 0.00017766990291262133, |
|
"loss": 0.7288, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.735632183908046, |
|
"grad_norm": 0.5631998777389526, |
|
"learning_rate": 0.0001769417475728155, |
|
"loss": 0.7012, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.7448275862068967, |
|
"grad_norm": 0.4876724183559418, |
|
"learning_rate": 0.0001762135922330097, |
|
"loss": 0.7046, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.754022988505747, |
|
"grad_norm": 0.5962346792221069, |
|
"learning_rate": 0.00017548543689320387, |
|
"loss": 0.7304, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.7632183908045977, |
|
"grad_norm": 0.543746292591095, |
|
"learning_rate": 0.00017475728155339802, |
|
"loss": 0.7605, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.7724137931034483, |
|
"grad_norm": 0.5156055688858032, |
|
"learning_rate": 0.0001740291262135922, |
|
"loss": 0.6376, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.7816091954022988, |
|
"grad_norm": 0.5197211503982544, |
|
"learning_rate": 0.00017330097087378638, |
|
"loss": 0.5259, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.7908045977011495, |
|
"grad_norm": 0.5126516222953796, |
|
"learning_rate": 0.00017257281553398056, |
|
"loss": 0.6008, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.6471008658409119, |
|
"learning_rate": 0.00017184466019417474, |
|
"loss": 0.6885, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.8091954022988506, |
|
"grad_norm": 0.9358019828796387, |
|
"learning_rate": 0.00017111650485436891, |
|
"loss": 0.7411, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.8183908045977013, |
|
"grad_norm": 0.7807464003562927, |
|
"learning_rate": 0.0001703883495145631, |
|
"loss": 0.7555, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.8275862068965516, |
|
"grad_norm": 0.5318418741226196, |
|
"learning_rate": 0.00016966019417475724, |
|
"loss": 0.6653, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.8367816091954023, |
|
"grad_norm": 0.4897342622280121, |
|
"learning_rate": 0.00016893203883495142, |
|
"loss": 0.5241, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.8459770114942529, |
|
"grad_norm": 0.44622400403022766, |
|
"learning_rate": 0.0001682038834951456, |
|
"loss": 0.5036, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.8551724137931034, |
|
"grad_norm": 0.5487281680107117, |
|
"learning_rate": 0.00016747572815533978, |
|
"loss": 0.5633, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.8643678160919541, |
|
"grad_norm": 0.43121638894081116, |
|
"learning_rate": 0.00016674757281553396, |
|
"loss": 0.7004, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.8735632183908046, |
|
"grad_norm": 0.44173818826675415, |
|
"learning_rate": 0.00016601941747572814, |
|
"loss": 0.7274, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.8827586206896552, |
|
"grad_norm": 0.583820104598999, |
|
"learning_rate": 0.00016529126213592232, |
|
"loss": 0.6553, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.891954022988506, |
|
"grad_norm": 0.4245299994945526, |
|
"learning_rate": 0.0001645631067961165, |
|
"loss": 0.6727, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.9011494252873562, |
|
"grad_norm": 0.4857756495475769, |
|
"learning_rate": 0.00016383495145631065, |
|
"loss": 0.6137, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.910344827586207, |
|
"grad_norm": 0.5263829827308655, |
|
"learning_rate": 0.00016310679611650483, |
|
"loss": 0.8124, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.9195402298850575, |
|
"grad_norm": 0.5227271914482117, |
|
"learning_rate": 0.000162378640776699, |
|
"loss": 0.6744, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.928735632183908, |
|
"grad_norm": 0.5538279414176941, |
|
"learning_rate": 0.00016165048543689318, |
|
"loss": 0.6061, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.9379310344827587, |
|
"grad_norm": 0.5026021003723145, |
|
"learning_rate": 0.00016092233009708736, |
|
"loss": 0.5295, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.9471264367816092, |
|
"grad_norm": 0.5182415843009949, |
|
"learning_rate": 0.00016019417475728154, |
|
"loss": 0.6786, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.9563218390804598, |
|
"grad_norm": 0.441493958234787, |
|
"learning_rate": 0.00015946601941747572, |
|
"loss": 0.5121, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.9655172413793105, |
|
"grad_norm": 0.4822331666946411, |
|
"learning_rate": 0.0001587378640776699, |
|
"loss": 0.541, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.9747126436781608, |
|
"grad_norm": 0.5318118333816528, |
|
"learning_rate": 0.00015800970873786405, |
|
"loss": 0.9281, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.9839080459770115, |
|
"grad_norm": 0.5313464403152466, |
|
"learning_rate": 0.00015728155339805823, |
|
"loss": 0.9183, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.993103448275862, |
|
"grad_norm": 0.5637298226356506, |
|
"learning_rate": 0.0001565533980582524, |
|
"loss": 0.7542, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.6160908341407776, |
|
"learning_rate": 0.00015582524271844658, |
|
"loss": 0.8758, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.3436731100082397, |
|
"eval_runtime": 71.2384, |
|
"eval_samples_per_second": 4.66, |
|
"eval_steps_per_second": 2.33, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 2.0091954022988507, |
|
"grad_norm": 0.5377904772758484, |
|
"learning_rate": 0.00015509708737864076, |
|
"loss": 0.4721, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 2.018390804597701, |
|
"grad_norm": 0.4621603488922119, |
|
"learning_rate": 0.00015436893203883494, |
|
"loss": 0.6065, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.027586206896552, |
|
"grad_norm": 0.43245628476142883, |
|
"learning_rate": 0.00015364077669902912, |
|
"loss": 0.5602, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 2.036781609195402, |
|
"grad_norm": 0.522826611995697, |
|
"learning_rate": 0.0001529126213592233, |
|
"loss": 0.6172, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 2.045977011494253, |
|
"grad_norm": 0.35645684599876404, |
|
"learning_rate": 0.00015218446601941745, |
|
"loss": 0.4566, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 2.0551724137931036, |
|
"grad_norm": 0.5614010095596313, |
|
"learning_rate": 0.00015145631067961163, |
|
"loss": 0.5109, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 2.064367816091954, |
|
"grad_norm": 0.44048449397087097, |
|
"learning_rate": 0.0001507281553398058, |
|
"loss": 0.5269, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.0735632183908046, |
|
"grad_norm": 0.4516451358795166, |
|
"learning_rate": 0.00015, |
|
"loss": 0.46, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 2.0827586206896553, |
|
"grad_norm": 0.6180903315544128, |
|
"learning_rate": 0.00014927184466019417, |
|
"loss": 0.7842, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 2.0919540229885056, |
|
"grad_norm": 0.579887330532074, |
|
"learning_rate": 0.00014854368932038834, |
|
"loss": 0.4769, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 2.1011494252873564, |
|
"grad_norm": 0.4342517554759979, |
|
"learning_rate": 0.00014781553398058252, |
|
"loss": 0.4867, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 2.110344827586207, |
|
"grad_norm": 0.45214682817459106, |
|
"learning_rate": 0.00014708737864077667, |
|
"loss": 0.525, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.1195402298850574, |
|
"grad_norm": 0.4345957934856415, |
|
"learning_rate": 0.00014635922330097085, |
|
"loss": 0.4967, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 2.128735632183908, |
|
"grad_norm": 0.5357015132904053, |
|
"learning_rate": 0.00014563106796116503, |
|
"loss": 0.5978, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 2.1379310344827585, |
|
"grad_norm": 0.4193997383117676, |
|
"learning_rate": 0.0001449029126213592, |
|
"loss": 0.4745, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 2.147126436781609, |
|
"grad_norm": 0.552384614944458, |
|
"learning_rate": 0.0001441747572815534, |
|
"loss": 0.6451, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 2.15632183908046, |
|
"grad_norm": 0.5584462285041809, |
|
"learning_rate": 0.00014344660194174757, |
|
"loss": 0.6536, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.1655172413793102, |
|
"grad_norm": 0.5344610810279846, |
|
"learning_rate": 0.00014271844660194175, |
|
"loss": 0.5067, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 2.174712643678161, |
|
"grad_norm": 0.48159855604171753, |
|
"learning_rate": 0.00014199029126213592, |
|
"loss": 0.6179, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 2.1839080459770113, |
|
"grad_norm": 0.48250091075897217, |
|
"learning_rate": 0.00014126213592233008, |
|
"loss": 0.6128, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 2.193103448275862, |
|
"grad_norm": 0.3940607011318207, |
|
"learning_rate": 0.00014053398058252425, |
|
"loss": 0.496, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 2.2022988505747128, |
|
"grad_norm": 0.42841699719429016, |
|
"learning_rate": 0.00013980582524271843, |
|
"loss": 0.4572, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.211494252873563, |
|
"grad_norm": 0.5661161541938782, |
|
"learning_rate": 0.0001390776699029126, |
|
"loss": 0.7899, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 2.220689655172414, |
|
"grad_norm": 0.5432340502738953, |
|
"learning_rate": 0.0001383495145631068, |
|
"loss": 0.4043, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 2.2298850574712645, |
|
"grad_norm": 0.4754869043827057, |
|
"learning_rate": 0.00013762135922330097, |
|
"loss": 0.408, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 2.239080459770115, |
|
"grad_norm": 0.4958679974079132, |
|
"learning_rate": 0.00013689320388349515, |
|
"loss": 0.6578, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 2.2482758620689656, |
|
"grad_norm": 0.5558366775512695, |
|
"learning_rate": 0.00013616504854368933, |
|
"loss": 0.7971, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.2574712643678163, |
|
"grad_norm": 0.45863062143325806, |
|
"learning_rate": 0.00013543689320388348, |
|
"loss": 0.3698, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.2666666666666666, |
|
"grad_norm": 0.5656180381774902, |
|
"learning_rate": 0.00013470873786407766, |
|
"loss": 0.7491, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 2.2758620689655173, |
|
"grad_norm": 0.3734663724899292, |
|
"learning_rate": 0.00013398058252427184, |
|
"loss": 0.3697, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 2.2850574712643676, |
|
"grad_norm": 0.6065300107002258, |
|
"learning_rate": 0.00013325242718446601, |
|
"loss": 0.5937, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 2.2942528735632184, |
|
"grad_norm": 0.5307329297065735, |
|
"learning_rate": 0.0001325242718446602, |
|
"loss": 0.8089, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.303448275862069, |
|
"grad_norm": 0.41086483001708984, |
|
"learning_rate": 0.00013179611650485437, |
|
"loss": 0.4857, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 2.3126436781609194, |
|
"grad_norm": 0.5805680751800537, |
|
"learning_rate": 0.00013106796116504852, |
|
"loss": 0.4235, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.32183908045977, |
|
"grad_norm": 0.472192645072937, |
|
"learning_rate": 0.0001303398058252427, |
|
"loss": 0.5524, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 2.3310344827586205, |
|
"grad_norm": 0.41721925139427185, |
|
"learning_rate": 0.00012961165048543688, |
|
"loss": 0.4386, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 2.340229885057471, |
|
"grad_norm": 0.42330309748649597, |
|
"learning_rate": 0.00012888349514563106, |
|
"loss": 0.4517, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.349425287356322, |
|
"grad_norm": 0.44302302598953247, |
|
"learning_rate": 0.00012815533980582524, |
|
"loss": 0.5756, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 2.3586206896551722, |
|
"grad_norm": 0.6171020865440369, |
|
"learning_rate": 0.0001274271844660194, |
|
"loss": 0.781, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 2.367816091954023, |
|
"grad_norm": 0.5262989401817322, |
|
"learning_rate": 0.00012669902912621357, |
|
"loss": 0.4601, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.3770114942528737, |
|
"grad_norm": 0.4261441230773926, |
|
"learning_rate": 0.00012597087378640775, |
|
"loss": 0.5883, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 2.386206896551724, |
|
"grad_norm": 0.5421766042709351, |
|
"learning_rate": 0.00012524271844660192, |
|
"loss": 0.6253, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.3954022988505748, |
|
"grad_norm": 0.52345871925354, |
|
"learning_rate": 0.0001245145631067961, |
|
"loss": 0.5872, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 2.4045977011494255, |
|
"grad_norm": 0.36535054445266724, |
|
"learning_rate": 0.00012378640776699028, |
|
"loss": 0.4097, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 2.413793103448276, |
|
"grad_norm": 0.5453282594680786, |
|
"learning_rate": 0.00012305825242718446, |
|
"loss": 0.3713, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 2.4229885057471265, |
|
"grad_norm": 0.4698062837123871, |
|
"learning_rate": 0.0001223300970873786, |
|
"loss": 0.3663, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.432183908045977, |
|
"grad_norm": 0.41437670588493347, |
|
"learning_rate": 0.0001216019417475728, |
|
"loss": 0.4332, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.4413793103448276, |
|
"grad_norm": 0.5256748795509338, |
|
"learning_rate": 0.00012087378640776697, |
|
"loss": 0.6989, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 2.4505747126436783, |
|
"grad_norm": 0.45884081721305847, |
|
"learning_rate": 0.00012014563106796115, |
|
"loss": 0.5406, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 2.4597701149425286, |
|
"grad_norm": 0.391105592250824, |
|
"learning_rate": 0.00011941747572815533, |
|
"loss": 0.4213, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.4689655172413794, |
|
"grad_norm": 0.41228482127189636, |
|
"learning_rate": 0.0001186893203883495, |
|
"loss": 0.4283, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 2.4781609195402297, |
|
"grad_norm": 0.37000706791877747, |
|
"learning_rate": 0.00011796116504854367, |
|
"loss": 0.3736, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.4873563218390804, |
|
"grad_norm": 0.4249720871448517, |
|
"learning_rate": 0.00011723300970873785, |
|
"loss": 0.4086, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 2.496551724137931, |
|
"grad_norm": 0.4845840036869049, |
|
"learning_rate": 0.00011650485436893203, |
|
"loss": 0.8329, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.5057471264367814, |
|
"grad_norm": 0.3498639762401581, |
|
"learning_rate": 0.0001157766990291262, |
|
"loss": 0.2863, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 2.514942528735632, |
|
"grad_norm": 0.3682704567909241, |
|
"learning_rate": 0.00011504854368932037, |
|
"loss": 0.489, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.524137931034483, |
|
"grad_norm": 0.4684116244316101, |
|
"learning_rate": 0.00011432038834951455, |
|
"loss": 0.5256, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.533333333333333, |
|
"grad_norm": 0.22496339678764343, |
|
"learning_rate": 0.00011359223300970873, |
|
"loss": 0.2619, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.542528735632184, |
|
"grad_norm": 0.4395711421966553, |
|
"learning_rate": 0.00011286407766990291, |
|
"loss": 0.5677, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 2.5517241379310347, |
|
"grad_norm": 0.4096417725086212, |
|
"learning_rate": 0.00011213592233009707, |
|
"loss": 0.5133, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.560919540229885, |
|
"grad_norm": 0.31631290912628174, |
|
"learning_rate": 0.00011140776699029125, |
|
"loss": 0.3498, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 2.5701149425287357, |
|
"grad_norm": 0.2988245487213135, |
|
"learning_rate": 0.00011067961165048543, |
|
"loss": 0.3324, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.5793103448275865, |
|
"grad_norm": 0.3820885121822357, |
|
"learning_rate": 0.00010995145631067961, |
|
"loss": 0.4259, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 2.5885057471264368, |
|
"grad_norm": 0.39478757977485657, |
|
"learning_rate": 0.00010922330097087377, |
|
"loss": 0.432, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 2.5977011494252875, |
|
"grad_norm": 0.5134022235870361, |
|
"learning_rate": 0.00010849514563106795, |
|
"loss": 0.6062, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 2.606896551724138, |
|
"grad_norm": 0.30425119400024414, |
|
"learning_rate": 0.00010776699029126213, |
|
"loss": 0.3041, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 2.6160919540229886, |
|
"grad_norm": 0.6497390270233154, |
|
"learning_rate": 0.00010703883495145631, |
|
"loss": 0.4152, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.625287356321839, |
|
"grad_norm": 0.41369131207466125, |
|
"learning_rate": 0.00010631067961165047, |
|
"loss": 0.4364, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 2.6344827586206896, |
|
"grad_norm": 0.3804008960723877, |
|
"learning_rate": 0.00010558252427184465, |
|
"loss": 0.3777, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 2.6436781609195403, |
|
"grad_norm": 0.2623213529586792, |
|
"learning_rate": 0.00010485436893203883, |
|
"loss": 0.2425, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.6528735632183906, |
|
"grad_norm": 0.43846645951271057, |
|
"learning_rate": 0.00010412621359223301, |
|
"loss": 0.5522, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 2.6620689655172414, |
|
"grad_norm": 0.45738276839256287, |
|
"learning_rate": 0.00010339805825242718, |
|
"loss": 0.5292, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.671264367816092, |
|
"grad_norm": 0.42997756600379944, |
|
"learning_rate": 0.00010266990291262135, |
|
"loss": 0.6032, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 2.6804597701149424, |
|
"grad_norm": 0.37619197368621826, |
|
"learning_rate": 0.00010194174757281553, |
|
"loss": 0.3905, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 2.689655172413793, |
|
"grad_norm": 0.41694867610931396, |
|
"learning_rate": 0.00010121359223300971, |
|
"loss": 0.581, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 2.698850574712644, |
|
"grad_norm": 0.3665846288204193, |
|
"learning_rate": 0.00010048543689320388, |
|
"loss": 0.4677, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 2.708045977011494, |
|
"grad_norm": 0.6262102127075195, |
|
"learning_rate": 9.975728155339806e-05, |
|
"loss": 0.7874, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.717241379310345, |
|
"grad_norm": 0.5264686942100525, |
|
"learning_rate": 9.902912621359223e-05, |
|
"loss": 0.6461, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 2.7264367816091957, |
|
"grad_norm": 0.3872581422328949, |
|
"learning_rate": 9.83009708737864e-05, |
|
"loss": 0.4463, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 2.735632183908046, |
|
"grad_norm": 0.4705924391746521, |
|
"learning_rate": 9.757281553398058e-05, |
|
"loss": 0.5875, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 2.7448275862068967, |
|
"grad_norm": 0.34030237793922424, |
|
"learning_rate": 9.684466019417476e-05, |
|
"loss": 0.4063, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 2.754022988505747, |
|
"grad_norm": 0.4221409857273102, |
|
"learning_rate": 9.611650485436893e-05, |
|
"loss": 0.4782, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.7632183908045977, |
|
"grad_norm": 0.40130722522735596, |
|
"learning_rate": 9.53883495145631e-05, |
|
"loss": 0.4659, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 2.772413793103448, |
|
"grad_norm": 0.512660801410675, |
|
"learning_rate": 9.466019417475728e-05, |
|
"loss": 0.5244, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 2.781609195402299, |
|
"grad_norm": 0.36750540137290955, |
|
"learning_rate": 9.393203883495146e-05, |
|
"loss": 0.3848, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 2.7908045977011495, |
|
"grad_norm": 0.5015677213668823, |
|
"learning_rate": 9.320388349514561e-05, |
|
"loss": 0.5958, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.36262983083724976, |
|
"learning_rate": 9.247572815533979e-05, |
|
"loss": 0.3962, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.8091954022988506, |
|
"grad_norm": 0.34707215428352356, |
|
"learning_rate": 9.174757281553397e-05, |
|
"loss": 0.3028, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 2.8183908045977013, |
|
"grad_norm": 0.5340617299079895, |
|
"learning_rate": 9.101941747572814e-05, |
|
"loss": 0.7775, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 2.8275862068965516, |
|
"grad_norm": 0.26941734552383423, |
|
"learning_rate": 9.029126213592231e-05, |
|
"loss": 0.3095, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 2.8367816091954023, |
|
"grad_norm": 0.33745089173316956, |
|
"learning_rate": 8.956310679611649e-05, |
|
"loss": 0.363, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 2.845977011494253, |
|
"grad_norm": 0.39796075224876404, |
|
"learning_rate": 8.883495145631067e-05, |
|
"loss": 0.3696, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.8551724137931034, |
|
"grad_norm": 0.2719421088695526, |
|
"learning_rate": 8.810679611650485e-05, |
|
"loss": 0.2346, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 2.864367816091954, |
|
"grad_norm": 0.5412010550498962, |
|
"learning_rate": 8.737864077669901e-05, |
|
"loss": 0.6339, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.873563218390805, |
|
"grad_norm": 0.32390540838241577, |
|
"learning_rate": 8.665048543689319e-05, |
|
"loss": 0.4181, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.882758620689655, |
|
"grad_norm": 0.3090226650238037, |
|
"learning_rate": 8.592233009708737e-05, |
|
"loss": 0.3762, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 2.891954022988506, |
|
"grad_norm": 0.49264782667160034, |
|
"learning_rate": 8.519417475728155e-05, |
|
"loss": 0.7096, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.901149425287356, |
|
"grad_norm": 0.4478220045566559, |
|
"learning_rate": 8.446601941747571e-05, |
|
"loss": 0.4978, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 2.910344827586207, |
|
"grad_norm": 0.570834755897522, |
|
"learning_rate": 8.373786407766989e-05, |
|
"loss": 0.7979, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 2.9195402298850572, |
|
"grad_norm": 0.3681240975856781, |
|
"learning_rate": 8.300970873786407e-05, |
|
"loss": 0.4216, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 2.928735632183908, |
|
"grad_norm": 0.3753468096256256, |
|
"learning_rate": 8.228155339805825e-05, |
|
"loss": 0.3754, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 2.9379310344827587, |
|
"grad_norm": 0.5123324990272522, |
|
"learning_rate": 8.155339805825241e-05, |
|
"loss": 0.703, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.947126436781609, |
|
"grad_norm": 0.46749231219291687, |
|
"learning_rate": 8.082524271844659e-05, |
|
"loss": 0.5047, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.9563218390804598, |
|
"grad_norm": 0.39987900853157043, |
|
"learning_rate": 8.009708737864077e-05, |
|
"loss": 0.4885, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.9655172413793105, |
|
"grad_norm": 0.36407917737960815, |
|
"learning_rate": 7.936893203883495e-05, |
|
"loss": 0.3912, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 2.974712643678161, |
|
"grad_norm": 0.4798453152179718, |
|
"learning_rate": 7.864077669902911e-05, |
|
"loss": 0.5546, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.9839080459770115, |
|
"grad_norm": 0.5728134512901306, |
|
"learning_rate": 7.791262135922329e-05, |
|
"loss": 0.6081, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.9931034482758623, |
|
"grad_norm": 0.5350143909454346, |
|
"learning_rate": 7.718446601941747e-05, |
|
"loss": 0.5677, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.6480472087860107, |
|
"learning_rate": 7.645631067961165e-05, |
|
"loss": 0.7052, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 1.3702210187911987, |
|
"eval_runtime": 70.8873, |
|
"eval_samples_per_second": 4.683, |
|
"eval_steps_per_second": 2.342, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 3.0091954022988507, |
|
"grad_norm": 0.25629866123199463, |
|
"learning_rate": 7.572815533980581e-05, |
|
"loss": 0.2765, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 3.018390804597701, |
|
"grad_norm": 0.27169641852378845, |
|
"learning_rate": 7.5e-05, |
|
"loss": 0.2572, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 3.027586206896552, |
|
"grad_norm": 0.32282203435897827, |
|
"learning_rate": 7.427184466019417e-05, |
|
"loss": 0.3032, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 3.036781609195402, |
|
"grad_norm": 0.3269301950931549, |
|
"learning_rate": 7.354368932038834e-05, |
|
"loss": 0.3195, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 3.045977011494253, |
|
"grad_norm": 0.4299718141555786, |
|
"learning_rate": 7.281553398058252e-05, |
|
"loss": 0.431, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 3.0551724137931036, |
|
"grad_norm": 0.4134504795074463, |
|
"learning_rate": 7.20873786407767e-05, |
|
"loss": 0.4176, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 3.064367816091954, |
|
"grad_norm": 0.3505900204181671, |
|
"learning_rate": 7.135922330097087e-05, |
|
"loss": 0.2812, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 3.0735632183908046, |
|
"grad_norm": 0.3282555341720581, |
|
"learning_rate": 7.063106796116504e-05, |
|
"loss": 0.3005, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 3.0827586206896553, |
|
"grad_norm": 0.5910685062408447, |
|
"learning_rate": 6.990291262135922e-05, |
|
"loss": 0.6528, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 3.0919540229885056, |
|
"grad_norm": 0.5548660159111023, |
|
"learning_rate": 6.91747572815534e-05, |
|
"loss": 0.5259, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 3.1011494252873564, |
|
"grad_norm": 0.35125166177749634, |
|
"learning_rate": 6.844660194174757e-05, |
|
"loss": 0.3435, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 3.110344827586207, |
|
"grad_norm": 0.3747900426387787, |
|
"learning_rate": 6.771844660194174e-05, |
|
"loss": 0.3145, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 3.1195402298850574, |
|
"grad_norm": 0.4187811613082886, |
|
"learning_rate": 6.699029126213592e-05, |
|
"loss": 0.4995, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 3.128735632183908, |
|
"grad_norm": 0.5449840426445007, |
|
"learning_rate": 6.62621359223301e-05, |
|
"loss": 0.5205, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 3.1379310344827585, |
|
"grad_norm": 0.5767033100128174, |
|
"learning_rate": 6.553398058252426e-05, |
|
"loss": 0.7207, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 3.147126436781609, |
|
"grad_norm": 0.3340611457824707, |
|
"learning_rate": 6.480582524271844e-05, |
|
"loss": 0.3074, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 3.15632183908046, |
|
"grad_norm": 0.4297197759151459, |
|
"learning_rate": 6.407766990291262e-05, |
|
"loss": 0.4564, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 3.1655172413793102, |
|
"grad_norm": 0.39158904552459717, |
|
"learning_rate": 6.334951456310678e-05, |
|
"loss": 0.2937, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 3.174712643678161, |
|
"grad_norm": 0.3491695523262024, |
|
"learning_rate": 6.262135922330096e-05, |
|
"loss": 0.2895, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 3.1839080459770113, |
|
"grad_norm": 0.4597693979740143, |
|
"learning_rate": 6.189320388349514e-05, |
|
"loss": 0.4943, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 3.193103448275862, |
|
"grad_norm": 0.6463683843612671, |
|
"learning_rate": 6.11650485436893e-05, |
|
"loss": 0.6489, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 3.2022988505747128, |
|
"grad_norm": 0.4894922077655792, |
|
"learning_rate": 6.0436893203883485e-05, |
|
"loss": 0.524, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 3.211494252873563, |
|
"grad_norm": 0.463912695646286, |
|
"learning_rate": 5.9708737864077663e-05, |
|
"loss": 0.4044, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 3.220689655172414, |
|
"grad_norm": 0.3212607204914093, |
|
"learning_rate": 5.8980582524271835e-05, |
|
"loss": 0.3002, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 3.2298850574712645, |
|
"grad_norm": 0.33345821499824524, |
|
"learning_rate": 5.8252427184466014e-05, |
|
"loss": 0.2962, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 3.239080459770115, |
|
"grad_norm": 0.30228373408317566, |
|
"learning_rate": 5.7524271844660186e-05, |
|
"loss": 0.2689, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 3.2482758620689656, |
|
"grad_norm": 0.3870169520378113, |
|
"learning_rate": 5.6796116504854364e-05, |
|
"loss": 0.3696, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 3.2574712643678163, |
|
"grad_norm": 0.5785714983940125, |
|
"learning_rate": 5.6067961165048536e-05, |
|
"loss": 0.5223, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 3.2666666666666666, |
|
"grad_norm": 0.40553051233291626, |
|
"learning_rate": 5.5339805825242715e-05, |
|
"loss": 0.4251, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 3.2758620689655173, |
|
"grad_norm": 0.3977018892765045, |
|
"learning_rate": 5.461165048543689e-05, |
|
"loss": 0.3159, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 3.2850574712643676, |
|
"grad_norm": 0.3914712071418762, |
|
"learning_rate": 5.3883495145631065e-05, |
|
"loss": 0.3273, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 3.2942528735632184, |
|
"grad_norm": 0.46192315220832825, |
|
"learning_rate": 5.315533980582524e-05, |
|
"loss": 0.4017, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 3.303448275862069, |
|
"grad_norm": 0.24280714988708496, |
|
"learning_rate": 5.2427184466019416e-05, |
|
"loss": 0.1976, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 3.3126436781609194, |
|
"grad_norm": 0.6448668241500854, |
|
"learning_rate": 5.169902912621359e-05, |
|
"loss": 0.6144, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 3.32183908045977, |
|
"grad_norm": 0.5141739249229431, |
|
"learning_rate": 5.0970873786407766e-05, |
|
"loss": 0.4446, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 3.3310344827586205, |
|
"grad_norm": 0.376054048538208, |
|
"learning_rate": 5.024271844660194e-05, |
|
"loss": 0.4357, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 3.340229885057471, |
|
"grad_norm": 0.40308278799057007, |
|
"learning_rate": 4.951456310679612e-05, |
|
"loss": 0.4151, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 3.349425287356322, |
|
"grad_norm": 0.5300136208534241, |
|
"learning_rate": 4.878640776699029e-05, |
|
"loss": 0.4244, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 3.3586206896551722, |
|
"grad_norm": 0.4713798463344574, |
|
"learning_rate": 4.805825242718447e-05, |
|
"loss": 0.4275, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 3.367816091954023, |
|
"grad_norm": 0.4588351249694824, |
|
"learning_rate": 4.733009708737864e-05, |
|
"loss": 0.4849, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 3.3770114942528737, |
|
"grad_norm": 0.5958248376846313, |
|
"learning_rate": 4.6601941747572804e-05, |
|
"loss": 0.6822, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 3.386206896551724, |
|
"grad_norm": 0.6061305403709412, |
|
"learning_rate": 4.587378640776698e-05, |
|
"loss": 0.5434, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 3.3954022988505748, |
|
"grad_norm": 0.36690884828567505, |
|
"learning_rate": 4.5145631067961155e-05, |
|
"loss": 0.2909, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 3.4045977011494255, |
|
"grad_norm": 0.39009371399879456, |
|
"learning_rate": 4.4417475728155334e-05, |
|
"loss": 0.3972, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 3.413793103448276, |
|
"grad_norm": 0.3522622585296631, |
|
"learning_rate": 4.3689320388349505e-05, |
|
"loss": 0.3774, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 3.4229885057471265, |
|
"grad_norm": 0.3161761164665222, |
|
"learning_rate": 4.2961165048543684e-05, |
|
"loss": 0.2649, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 3.432183908045977, |
|
"grad_norm": 0.41937097907066345, |
|
"learning_rate": 4.2233009708737856e-05, |
|
"loss": 0.3484, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 3.4413793103448276, |
|
"grad_norm": 0.5076631307601929, |
|
"learning_rate": 4.1504854368932035e-05, |
|
"loss": 0.6585, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 3.4505747126436783, |
|
"grad_norm": 0.3357332944869995, |
|
"learning_rate": 4.0776699029126206e-05, |
|
"loss": 0.2972, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 3.4597701149425286, |
|
"grad_norm": 0.4982966482639313, |
|
"learning_rate": 4.0048543689320385e-05, |
|
"loss": 0.4164, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 3.4689655172413794, |
|
"grad_norm": 0.35467639565467834, |
|
"learning_rate": 3.932038834951456e-05, |
|
"loss": 0.3142, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 3.4781609195402297, |
|
"grad_norm": 0.5145273804664612, |
|
"learning_rate": 3.8592233009708736e-05, |
|
"loss": 0.5546, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 3.4873563218390804, |
|
"grad_norm": 0.3629278242588043, |
|
"learning_rate": 3.786407766990291e-05, |
|
"loss": 0.3203, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.496551724137931, |
|
"grad_norm": 0.38538166880607605, |
|
"learning_rate": 3.7135922330097086e-05, |
|
"loss": 0.3388, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 3.5057471264367814, |
|
"grad_norm": 0.455674946308136, |
|
"learning_rate": 3.640776699029126e-05, |
|
"loss": 0.3871, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 3.514942528735632, |
|
"grad_norm": 0.44031426310539246, |
|
"learning_rate": 3.5679611650485437e-05, |
|
"loss": 0.3418, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 3.524137931034483, |
|
"grad_norm": 0.39968228340148926, |
|
"learning_rate": 3.495145631067961e-05, |
|
"loss": 0.367, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 3.533333333333333, |
|
"grad_norm": 0.46408310532569885, |
|
"learning_rate": 3.422330097087379e-05, |
|
"loss": 0.4872, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 3.542528735632184, |
|
"grad_norm": 0.5868334174156189, |
|
"learning_rate": 3.349514563106796e-05, |
|
"loss": 0.6086, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 3.5517241379310347, |
|
"grad_norm": 0.39106428623199463, |
|
"learning_rate": 3.276699029126213e-05, |
|
"loss": 0.311, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 3.560919540229885, |
|
"grad_norm": 0.35964855551719666, |
|
"learning_rate": 3.203883495145631e-05, |
|
"loss": 0.2857, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 3.5701149425287357, |
|
"grad_norm": 0.3140401840209961, |
|
"learning_rate": 3.131067961165048e-05, |
|
"loss": 0.2487, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 3.5793103448275865, |
|
"grad_norm": 0.42587795853614807, |
|
"learning_rate": 3.058252427184465e-05, |
|
"loss": 0.3503, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.5885057471264368, |
|
"grad_norm": 0.4712708294391632, |
|
"learning_rate": 2.9854368932038832e-05, |
|
"loss": 0.4098, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 3.5977011494252875, |
|
"grad_norm": 0.42317089438438416, |
|
"learning_rate": 2.9126213592233007e-05, |
|
"loss": 0.3544, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 3.606896551724138, |
|
"grad_norm": 0.5408965349197388, |
|
"learning_rate": 2.8398058252427182e-05, |
|
"loss": 0.4863, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 3.6160919540229886, |
|
"grad_norm": 0.45792847871780396, |
|
"learning_rate": 2.7669902912621357e-05, |
|
"loss": 0.4181, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 3.625287356321839, |
|
"grad_norm": 0.5969738364219666, |
|
"learning_rate": 2.6941747572815533e-05, |
|
"loss": 0.5706, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 3.6344827586206896, |
|
"grad_norm": 0.455818235874176, |
|
"learning_rate": 2.6213592233009708e-05, |
|
"loss": 0.3613, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 3.6436781609195403, |
|
"grad_norm": 0.3493446111679077, |
|
"learning_rate": 2.5485436893203883e-05, |
|
"loss": 0.3446, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 3.6528735632183906, |
|
"grad_norm": 0.3833727538585663, |
|
"learning_rate": 2.475728155339806e-05, |
|
"loss": 0.2537, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 3.6620689655172414, |
|
"grad_norm": 0.4627190828323364, |
|
"learning_rate": 2.4029126213592234e-05, |
|
"loss": 0.392, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 3.671264367816092, |
|
"grad_norm": 0.5414885878562927, |
|
"learning_rate": 2.3300970873786402e-05, |
|
"loss": 0.5864, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.6804597701149424, |
|
"grad_norm": 0.3071203827857971, |
|
"learning_rate": 2.2572815533980577e-05, |
|
"loss": 0.2578, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 3.689655172413793, |
|
"grad_norm": 0.3995971381664276, |
|
"learning_rate": 2.1844660194174753e-05, |
|
"loss": 0.3063, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 3.698850574712644, |
|
"grad_norm": 0.5993015170097351, |
|
"learning_rate": 2.1116504854368928e-05, |
|
"loss": 0.6541, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 3.708045977011494, |
|
"grad_norm": 0.3946886658668518, |
|
"learning_rate": 2.0388349514563103e-05, |
|
"loss": 0.4038, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 3.717241379310345, |
|
"grad_norm": 0.513338565826416, |
|
"learning_rate": 1.966019417475728e-05, |
|
"loss": 0.5935, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 3.7264367816091957, |
|
"grad_norm": 0.49390944838523865, |
|
"learning_rate": 1.8932038834951454e-05, |
|
"loss": 0.3909, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 3.735632183908046, |
|
"grad_norm": 0.4437660574913025, |
|
"learning_rate": 1.820388349514563e-05, |
|
"loss": 0.3078, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 3.7448275862068967, |
|
"grad_norm": 0.515243649482727, |
|
"learning_rate": 1.7475728155339804e-05, |
|
"loss": 0.4326, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 3.754022988505747, |
|
"grad_norm": 0.5932897925376892, |
|
"learning_rate": 1.674757281553398e-05, |
|
"loss": 0.4829, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 3.7632183908045977, |
|
"grad_norm": 0.36783942580223083, |
|
"learning_rate": 1.6019417475728155e-05, |
|
"loss": 0.3196, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.772413793103448, |
|
"grad_norm": 0.6258606910705566, |
|
"learning_rate": 1.5291262135922327e-05, |
|
"loss": 0.6144, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 3.781609195402299, |
|
"grad_norm": 0.4248901605606079, |
|
"learning_rate": 1.4563106796116503e-05, |
|
"loss": 0.4448, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 3.7908045977011495, |
|
"grad_norm": 0.5265323519706726, |
|
"learning_rate": 1.3834951456310679e-05, |
|
"loss": 0.4877, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 0.48001861572265625, |
|
"learning_rate": 1.3106796116504854e-05, |
|
"loss": 0.4305, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 3.8091954022988506, |
|
"grad_norm": 0.25024116039276123, |
|
"learning_rate": 1.237864077669903e-05, |
|
"loss": 0.2016, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 3.8183908045977013, |
|
"grad_norm": 0.4971444308757782, |
|
"learning_rate": 1.1650485436893201e-05, |
|
"loss": 0.4237, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 3.8275862068965516, |
|
"grad_norm": 0.3394944965839386, |
|
"learning_rate": 1.0922330097087376e-05, |
|
"loss": 0.3308, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 3.8367816091954023, |
|
"grad_norm": 0.3994314670562744, |
|
"learning_rate": 1.0194174757281552e-05, |
|
"loss": 0.3837, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 3.845977011494253, |
|
"grad_norm": 0.3695448338985443, |
|
"learning_rate": 9.466019417475727e-06, |
|
"loss": 0.2975, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 3.8551724137931034, |
|
"grad_norm": 0.4546396732330322, |
|
"learning_rate": 8.737864077669902e-06, |
|
"loss": 0.4375, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.864367816091954, |
|
"grad_norm": 0.45976778864860535, |
|
"learning_rate": 8.009708737864077e-06, |
|
"loss": 0.3612, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 3.873563218390805, |
|
"grad_norm": 0.40973135828971863, |
|
"learning_rate": 7.281553398058252e-06, |
|
"loss": 0.4075, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 3.882758620689655, |
|
"grad_norm": 0.3295038938522339, |
|
"learning_rate": 6.553398058252427e-06, |
|
"loss": 0.2314, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 3.891954022988506, |
|
"grad_norm": 0.37351667881011963, |
|
"learning_rate": 5.8252427184466006e-06, |
|
"loss": 0.3237, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 3.901149425287356, |
|
"grad_norm": 0.5769475102424622, |
|
"learning_rate": 5.097087378640776e-06, |
|
"loss": 0.5066, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 3.910344827586207, |
|
"grad_norm": 0.4906369745731354, |
|
"learning_rate": 4.368932038834951e-06, |
|
"loss": 0.4666, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 3.9195402298850572, |
|
"grad_norm": 0.2950834035873413, |
|
"learning_rate": 3.640776699029126e-06, |
|
"loss": 0.2048, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 3.928735632183908, |
|
"grad_norm": 0.3957897126674652, |
|
"learning_rate": 2.9126213592233003e-06, |
|
"loss": 0.3443, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 3.9379310344827587, |
|
"grad_norm": 0.4838825464248657, |
|
"learning_rate": 2.1844660194174755e-06, |
|
"loss": 0.3792, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 3.947126436781609, |
|
"grad_norm": 0.3165542781352997, |
|
"learning_rate": 1.4563106796116501e-06, |
|
"loss": 0.4011, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.9563218390804598, |
|
"grad_norm": 0.5820279717445374, |
|
"learning_rate": 7.281553398058251e-07, |
|
"loss": 0.5933, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 3.9655172413793105, |
|
"grad_norm": 0.649249792098999, |
|
"learning_rate": 0.0, |
|
"loss": 0.6057, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 3.9655172413793105, |
|
"eval_loss": 1.4152063131332397, |
|
"eval_runtime": 70.5317, |
|
"eval_samples_per_second": 4.707, |
|
"eval_steps_per_second": 2.354, |
|
"step": 432 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 432, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.618106035999867e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|