{ "best_metric": 0.07254856824874878, "best_model_checkpoint": "./teapotllm/checkpoint-1224", "epoch": 20.0, "eval_steps": 500, "global_step": 8160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 0.07505040615797043, "learning_rate": 4.75e-05, "loss": 0.0953, "step": 408 }, { "epoch": 1.0, "eval_loss": 0.07464946806430817, "eval_runtime": 4.9963, "eval_samples_per_second": 45.034, "eval_steps_per_second": 5.804, "step": 408 }, { "epoch": 2.0, "grad_norm": 0.3880476951599121, "learning_rate": 4.5e-05, "loss": 0.0706, "step": 816 }, { "epoch": 2.0, "eval_loss": 0.0732811763882637, "eval_runtime": 4.9711, "eval_samples_per_second": 45.262, "eval_steps_per_second": 5.834, "step": 816 }, { "epoch": 3.0, "grad_norm": 0.514928936958313, "learning_rate": 4.25e-05, "loss": 0.0579, "step": 1224 }, { "epoch": 3.0, "eval_loss": 0.07254856824874878, "eval_runtime": 4.9773, "eval_samples_per_second": 45.205, "eval_steps_per_second": 5.826, "step": 1224 }, { "epoch": 4.0, "grad_norm": 0.4975501000881195, "learning_rate": 4e-05, "loss": 0.0493, "step": 1632 }, { "epoch": 4.0, "eval_loss": 0.07370911538600922, "eval_runtime": 4.9687, "eval_samples_per_second": 45.284, "eval_steps_per_second": 5.837, "step": 1632 }, { "epoch": 5.0, "grad_norm": 0.588830292224884, "learning_rate": 3.7500000000000003e-05, "loss": 0.0427, "step": 2040 }, { "epoch": 5.0, "eval_loss": 0.07484618574380875, "eval_runtime": 4.9726, "eval_samples_per_second": 45.248, "eval_steps_per_second": 5.832, "step": 2040 }, { "epoch": 6.0, "grad_norm": 0.07073836028575897, "learning_rate": 3.5e-05, "loss": 0.0376, "step": 2448 }, { "epoch": 6.0, "eval_loss": 0.07779362797737122, "eval_runtime": 4.9773, "eval_samples_per_second": 45.205, "eval_steps_per_second": 5.826, "step": 2448 }, { "epoch": 7.0, "grad_norm": 0.638589084148407, "learning_rate": 3.2500000000000004e-05, "loss": 0.0324, "step": 2856 }, { "epoch": 7.0, "eval_loss": 0.07828149944543839, "eval_runtime": 4.984, "eval_samples_per_second": 45.145, "eval_steps_per_second": 5.819, "step": 2856 }, { "epoch": 8.0, "grad_norm": 0.11211636662483215, "learning_rate": 3e-05, "loss": 0.0291, "step": 3264 }, { "epoch": 8.0, "eval_loss": 0.08292075246572495, "eval_runtime": 4.9953, "eval_samples_per_second": 45.042, "eval_steps_per_second": 5.805, "step": 3264 }, { "epoch": 9.0, "grad_norm": 0.2681402266025543, "learning_rate": 2.7500000000000004e-05, "loss": 0.0264, "step": 3672 }, { "epoch": 9.0, "eval_loss": 0.08268510550260544, "eval_runtime": 4.9786, "eval_samples_per_second": 45.194, "eval_steps_per_second": 5.825, "step": 3672 }, { "epoch": 10.0, "grad_norm": 0.26614490151405334, "learning_rate": 2.5e-05, "loss": 0.023, "step": 4080 }, { "epoch": 10.0, "eval_loss": 0.08475232124328613, "eval_runtime": 4.9942, "eval_samples_per_second": 45.052, "eval_steps_per_second": 5.807, "step": 4080 }, { "epoch": 11.0, "grad_norm": 0.05123787373304367, "learning_rate": 2.25e-05, "loss": 0.0217, "step": 4488 }, { "epoch": 11.0, "eval_loss": 0.08677990734577179, "eval_runtime": 5.0004, "eval_samples_per_second": 44.996, "eval_steps_per_second": 5.8, "step": 4488 }, { "epoch": 12.0, "grad_norm": 0.8446316719055176, "learning_rate": 2e-05, "loss": 0.0196, "step": 4896 }, { "epoch": 12.0, "eval_loss": 0.08961891382932663, "eval_runtime": 4.9782, "eval_samples_per_second": 45.197, "eval_steps_per_second": 5.825, "step": 4896 }, { "epoch": 13.0, "grad_norm": 0.32860517501831055, "learning_rate": 1.75e-05, "loss": 0.0178, "step": 5304 }, { "epoch": 13.0, "eval_loss": 0.09325850754976273, "eval_runtime": 4.9928, "eval_samples_per_second": 45.065, "eval_steps_per_second": 5.808, "step": 5304 }, { "epoch": 14.0, "grad_norm": 0.9496984481811523, "learning_rate": 1.5e-05, "loss": 0.0167, "step": 5712 }, { "epoch": 14.0, "eval_loss": 0.09426513314247131, "eval_runtime": 4.9809, "eval_samples_per_second": 45.173, "eval_steps_per_second": 5.822, "step": 5712 }, { "epoch": 15.0, "grad_norm": 0.056426361203193665, "learning_rate": 1.25e-05, "loss": 0.016, "step": 6120 }, { "epoch": 15.0, "eval_loss": 0.09544987976551056, "eval_runtime": 4.9829, "eval_samples_per_second": 45.155, "eval_steps_per_second": 5.82, "step": 6120 }, { "epoch": 16.0, "grad_norm": 0.05803034454584122, "learning_rate": 1e-05, "loss": 0.0147, "step": 6528 }, { "epoch": 16.0, "eval_loss": 0.09645407646894455, "eval_runtime": 4.9673, "eval_samples_per_second": 45.296, "eval_steps_per_second": 5.838, "step": 6528 }, { "epoch": 17.0, "grad_norm": 0.1461056023836136, "learning_rate": 7.5e-06, "loss": 0.0143, "step": 6936 }, { "epoch": 17.0, "eval_loss": 0.09935282170772552, "eval_runtime": 4.9798, "eval_samples_per_second": 45.182, "eval_steps_per_second": 5.824, "step": 6936 }, { "epoch": 18.0, "grad_norm": 0.007102633360773325, "learning_rate": 5e-06, "loss": 0.0136, "step": 7344 }, { "epoch": 18.0, "eval_loss": 0.1016102060675621, "eval_runtime": 4.9718, "eval_samples_per_second": 45.255, "eval_steps_per_second": 5.833, "step": 7344 }, { "epoch": 19.0, "grad_norm": 0.011653387919068336, "learning_rate": 2.5e-06, "loss": 0.0135, "step": 7752 }, { "epoch": 19.0, "eval_loss": 0.10058598965406418, "eval_runtime": 4.9846, "eval_samples_per_second": 45.139, "eval_steps_per_second": 5.818, "step": 7752 }, { "epoch": 20.0, "grad_norm": 0.7057294249534607, "learning_rate": 0.0, "loss": 0.013, "step": 8160 }, { "epoch": 20.0, "eval_loss": 0.10097935050725937, "eval_runtime": 4.9778, "eval_samples_per_second": 45.201, "eval_steps_per_second": 5.826, "step": 8160 } ], "logging_steps": 500, "max_steps": 8160, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.11615448449024e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }