{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100, "global_step": 248, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04032258064516129, "grad_norm": 1.8761260509490967, "learning_rate": 1.967741935483871e-05, "loss": 1.3894, "num_tokens": 583069.0, "step": 5 }, { "epoch": 0.08064516129032258, "grad_norm": 1.5366759300231934, "learning_rate": 1.9274193548387097e-05, "loss": 1.2545, "num_tokens": 1166377.0, "step": 10 }, { "epoch": 0.12096774193548387, "grad_norm": 1.5728304386138916, "learning_rate": 1.8870967741935487e-05, "loss": 1.2989, "num_tokens": 1764845.0, "step": 15 }, { "epoch": 0.16129032258064516, "grad_norm": 1.7080962657928467, "learning_rate": 1.8467741935483873e-05, "loss": 1.2884, "num_tokens": 2359985.0, "step": 20 }, { "epoch": 0.20161290322580644, "grad_norm": 1.4729899168014526, "learning_rate": 1.806451612903226e-05, "loss": 1.2504, "num_tokens": 2938303.0, "step": 25 }, { "epoch": 0.24193548387096775, "grad_norm": 1.5811930894851685, "learning_rate": 1.7661290322580645e-05, "loss": 1.2157, "num_tokens": 3536824.0, "step": 30 }, { "epoch": 0.28225806451612906, "grad_norm": 1.5144275426864624, "learning_rate": 1.7258064516129035e-05, "loss": 1.1586, "num_tokens": 4106428.0, "step": 35 }, { "epoch": 0.3225806451612903, "grad_norm": 1.2803715467453003, "learning_rate": 1.685483870967742e-05, "loss": 1.2279, "num_tokens": 4713410.0, "step": 40 }, { "epoch": 0.3629032258064516, "grad_norm": 1.0752050876617432, "learning_rate": 1.6451612903225807e-05, "loss": 1.208, "num_tokens": 5302842.0, "step": 45 }, { "epoch": 0.4032258064516129, "grad_norm": 0.8806188106536865, "learning_rate": 1.6048387096774193e-05, "loss": 1.1532, "num_tokens": 5883277.0, "step": 50 }, { "epoch": 0.4435483870967742, "grad_norm": 0.7167242169380188, "learning_rate": 1.5645161290322583e-05, "loss": 1.16, "num_tokens": 6457162.0, "step": 55 }, { "epoch": 0.4838709677419355, "grad_norm": 0.6805269122123718, "learning_rate": 1.5241935483870969e-05, "loss": 1.1472, "num_tokens": 7013145.0, "step": 60 }, { "epoch": 0.5241935483870968, "grad_norm": 0.7752795219421387, "learning_rate": 1.4838709677419357e-05, "loss": 1.1305, "num_tokens": 7621530.0, "step": 65 }, { "epoch": 0.5645161290322581, "grad_norm": 0.6784509420394897, "learning_rate": 1.4435483870967743e-05, "loss": 1.1395, "num_tokens": 8194938.0, "step": 70 }, { "epoch": 0.6048387096774194, "grad_norm": 0.6889496445655823, "learning_rate": 1.4032258064516131e-05, "loss": 1.132, "num_tokens": 8800144.0, "step": 75 }, { "epoch": 0.6451612903225806, "grad_norm": 0.7251303195953369, "learning_rate": 1.3629032258064517e-05, "loss": 1.1685, "num_tokens": 9408277.0, "step": 80 }, { "epoch": 0.6854838709677419, "grad_norm": 0.6782833337783813, "learning_rate": 1.3225806451612903e-05, "loss": 1.1287, "num_tokens": 10000739.0, "step": 85 }, { "epoch": 0.7258064516129032, "grad_norm": 0.6917558312416077, "learning_rate": 1.2822580645161291e-05, "loss": 1.1286, "num_tokens": 10586681.0, "step": 90 }, { "epoch": 0.7661290322580645, "grad_norm": 0.693255603313446, "learning_rate": 1.2419354838709679e-05, "loss": 1.1328, "num_tokens": 11185064.0, "step": 95 }, { "epoch": 0.8064516129032258, "grad_norm": 0.6752443313598633, "learning_rate": 1.2016129032258067e-05, "loss": 1.1341, "step": 100 }, { "epoch": 0.8064516129032258, "eval_loss": 1.121854543685913, "eval_num_tokens": 11776617.0, "eval_runtime": 0.7163, "eval_samples_per_second": 279.2, "eval_steps_per_second": 5.584, "step": 100 }, { "epoch": 0.8467741935483871, "grad_norm": 0.6200473308563232, "learning_rate": 1.1612903225806453e-05, "loss": 1.0986, "num_tokens": 12370086.0, "step": 105 }, { "epoch": 0.8870967741935484, "grad_norm": 0.6284494996070862, "learning_rate": 1.1209677419354839e-05, "loss": 1.1591, "num_tokens": 12949436.0, "step": 110 }, { "epoch": 0.9274193548387096, "grad_norm": 0.6932438611984253, "learning_rate": 1.0806451612903225e-05, "loss": 1.1478, "num_tokens": 13537898.0, "step": 115 }, { "epoch": 0.967741935483871, "grad_norm": 0.6768374443054199, "learning_rate": 1.0403225806451613e-05, "loss": 1.1116, "num_tokens": 14126751.0, "step": 120 }, { "epoch": 1.0080645161290323, "grad_norm": 0.7337521910667419, "learning_rate": 1e-05, "loss": 1.0969, "num_tokens": 14734884.0, "step": 125 }, { "epoch": 1.0483870967741935, "grad_norm": 0.7160388827323914, "learning_rate": 9.596774193548389e-06, "loss": 1.0158, "num_tokens": 15308122.0, "step": 130 }, { "epoch": 1.0887096774193548, "grad_norm": 0.6945542097091675, "learning_rate": 9.193548387096775e-06, "loss": 0.9843, "num_tokens": 15893652.0, "step": 135 }, { "epoch": 1.129032258064516, "grad_norm": 0.6891147494316101, "learning_rate": 8.790322580645163e-06, "loss": 1.0347, "num_tokens": 16479176.0, "step": 140 }, { "epoch": 1.1693548387096775, "grad_norm": 0.6116432547569275, "learning_rate": 8.387096774193549e-06, "loss": 0.9615, "num_tokens": 17068227.0, "step": 145 }, { "epoch": 1.2096774193548387, "grad_norm": 0.6009991765022278, "learning_rate": 7.983870967741935e-06, "loss": 0.9883, "num_tokens": 17648475.0, "step": 150 }, { "epoch": 1.25, "grad_norm": 0.6140655875205994, "learning_rate": 7.580645161290323e-06, "loss": 0.9833, "num_tokens": 18252289.0, "step": 155 }, { "epoch": 1.2903225806451613, "grad_norm": 0.5997236967086792, "learning_rate": 7.177419354838711e-06, "loss": 0.9659, "num_tokens": 18842881.0, "step": 160 }, { "epoch": 1.3306451612903225, "grad_norm": 0.583691418170929, "learning_rate": 6.774193548387097e-06, "loss": 1.0074, "num_tokens": 19423540.0, "step": 165 }, { "epoch": 1.370967741935484, "grad_norm": 0.617231011390686, "learning_rate": 6.370967741935485e-06, "loss": 1.0354, "num_tokens": 20017275.0, "step": 170 }, { "epoch": 1.4112903225806452, "grad_norm": 0.6525187492370605, "learning_rate": 5.967741935483872e-06, "loss": 1.0141, "num_tokens": 20592466.0, "step": 175 }, { "epoch": 1.4516129032258065, "grad_norm": 0.6343702673912048, "learning_rate": 5.564516129032258e-06, "loss": 1.0358, "num_tokens": 21170579.0, "step": 180 }, { "epoch": 1.4919354838709677, "grad_norm": 0.621397078037262, "learning_rate": 5.161290322580646e-06, "loss": 0.982, "num_tokens": 21779118.0, "step": 185 }, { "epoch": 1.532258064516129, "grad_norm": 0.6357585191726685, "learning_rate": 4.758064516129033e-06, "loss": 0.9599, "num_tokens": 22337296.0, "step": 190 }, { "epoch": 1.5725806451612905, "grad_norm": 0.6274889707565308, "learning_rate": 4.35483870967742e-06, "loss": 1.0216, "num_tokens": 22933384.0, "step": 195 }, { "epoch": 1.6129032258064515, "grad_norm": 0.6913103461265564, "learning_rate": 3.951612903225807e-06, "loss": 0.9954, "step": 200 }, { "epoch": 1.6129032258064515, "eval_loss": 1.1182941198349, "eval_num_tokens": 23494529.0, "eval_runtime": 0.6682, "eval_samples_per_second": 299.331, "eval_steps_per_second": 5.987, "step": 200 }, { "epoch": 1.653225806451613, "grad_norm": 0.6222682595252991, "learning_rate": 3.548387096774194e-06, "loss": 1.0195, "num_tokens": 24073440.0, "step": 205 }, { "epoch": 1.6935483870967742, "grad_norm": 0.6264064311981201, "learning_rate": 3.145161290322581e-06, "loss": 1.0458, "num_tokens": 24679112.0, "step": 210 }, { "epoch": 1.7338709677419355, "grad_norm": 0.629045844078064, "learning_rate": 2.7419354838709676e-06, "loss": 1.028, "num_tokens": 25286383.0, "step": 215 }, { "epoch": 1.7741935483870968, "grad_norm": 0.575624406337738, "learning_rate": 2.338709677419355e-06, "loss": 0.9633, "num_tokens": 25917151.0, "step": 220 }, { "epoch": 1.814516129032258, "grad_norm": 0.5966957211494446, "learning_rate": 1.935483870967742e-06, "loss": 1.0002, "num_tokens": 26525780.0, "step": 225 }, { "epoch": 1.8548387096774195, "grad_norm": 0.6634185314178467, "learning_rate": 1.5322580645161292e-06, "loss": 0.998, "num_tokens": 27096606.0, "step": 230 }, { "epoch": 1.8951612903225805, "grad_norm": 0.5811392068862915, "learning_rate": 1.1290322580645162e-06, "loss": 1.0284, "num_tokens": 27687166.0, "step": 235 }, { "epoch": 1.935483870967742, "grad_norm": 0.6014962792396545, "learning_rate": 7.258064516129033e-07, "loss": 0.9879, "num_tokens": 28284352.0, "step": 240 }, { "epoch": 1.9758064516129032, "grad_norm": 0.5961019396781921, "learning_rate": 3.2258064516129035e-07, "loss": 0.9683, "num_tokens": 28878645.0, "step": 245 }, { "epoch": 2.0, "num_tokens": 29227434.0, "step": 248, "total_flos": 4.8695749825160806e+17, "train_loss": 1.0886195955737945, "train_runtime": 315.8591, "train_samples_per_second": 100.083, "train_steps_per_second": 0.785 } ], "logging_steps": 5, "max_steps": 248, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.8695749825160806e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }