diff --git a/checkpoints/checkpoint-1000/config.json b/checkpoints/checkpoint-1000/config.json index fdd810faba252b1c3f085663999fe9c24e8e44d6..990b1515737ce9bfc8638b4941fcfacdce1334d3 100644 --- a/checkpoints/checkpoint-1000/config.json +++ b/checkpoints/checkpoint-1000/config.json @@ -10,12 +10,12 @@ "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", - "n_ctx": 1024, - "n_embd": 768, + "n_ctx": 2048, + "n_embd": 1024, "n_head": 2, "n_inner": null, - "n_layer": 6, - "n_positions": 1024, + "n_layer": 12, + "n_positions": 2048, "pad_token_id": 1, "reorder_and_upcast_attn": false, "resid_pdrop": 0.1, diff --git a/checkpoints/checkpoint-1000/model.safetensors b/checkpoints/checkpoint-1000/model.safetensors index 9af699fbec03a8399fd4819a422d41cb32bfcd01..97feb8389baaaf20736a9d6faabf1b06f0bdec96 100644 --- a/checkpoints/checkpoint-1000/model.safetensors +++ b/checkpoints/checkpoint-1000/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5ae997f2cf9eadb57922b94260884615421d73c0edf124fcadd583baa9960a6 -size 176343496 +oid sha256:ecddd7806bbce858d7a9c8f4a49aa06b7376eafb0955ae9642f0388121a2a689 +size 617130824 diff --git a/checkpoints/checkpoint-1000/optimizer.pt b/checkpoints/checkpoint-1000/optimizer.pt index e11bfba47da7c672f2a5187bc338cdbf7a2adefe..4b583e56415151f6a1836cd5e1ecfbe1ed18f055 100644 --- a/checkpoints/checkpoint-1000/optimizer.pt +++ b/checkpoints/checkpoint-1000/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b82b36c45fe5fc732e4631b0473258f619c92959baa3cbb2820eb493ff496b54 -size 352735610 +oid sha256:05384fd919d4331c4f62eb3e56ba3d26561bf6ef9ab915ef8c1ddcd165540b84 +size 1234355130 diff --git a/checkpoints/checkpoint-1000/rng_state.pth b/checkpoints/checkpoint-1000/rng_state.pth index 779c9105fc3e9fdfb72fd86806fcab16a7838880..2aa7b3bda41d323c455b214029b8c2e64582fd05 100644 --- a/checkpoints/checkpoint-1000/rng_state.pth +++ b/checkpoints/checkpoint-1000/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9fa03700ce614a8ccdcabc508ac1a77d109fb25e54d40dd1186c765609ba983 +oid sha256:3404e1aa2fa72c55a7ea6b1c303c4b57bf41922fdf8d7c880092c6ac1ab5b6b9 size 14244 diff --git a/checkpoints/checkpoint-1000/trainer_state.json b/checkpoints/checkpoint-1000/trainer_state.json index 975aad71a8b615c11657c1ac6232711e33df6d97..a008a310216941811cd7ba9b2866a58db9c05cf3 100644 --- a/checkpoints/checkpoint-1000/trainer_state.json +++ b/checkpoints/checkpoint-1000/trainer_state.json @@ -11,142 +11,142 @@ "log_history": [ { "epoch": 0.24271844660194175, - "grad_norm": 2.239194869995117, + "grad_norm": 2.8362834453582764, "learning_rate": 4.960355987055016e-05, - "loss": 6.099, + "loss": 6.0742, "step": 50 }, { "epoch": 0.4854368932038835, - "grad_norm": 3.8653194904327393, + "grad_norm": 2.9484496116638184, "learning_rate": 4.9199029126213595e-05, - "loss": 5.6954, + "loss": 5.7218, "step": 100 }, { "epoch": 0.7281553398058253, - "grad_norm": 4.652886867523193, + "grad_norm": 3.542572259902954, "learning_rate": 4.879449838187702e-05, - "loss": 5.3598, + "loss": 5.4227, "step": 150 }, { "epoch": 0.970873786407767, - "grad_norm": 5.048999309539795, + "grad_norm": 4.634098052978516, "learning_rate": 4.8389967637540455e-05, - "loss": 5.0982, + "loss": 5.1655, "step": 200 }, { "epoch": 1.2135922330097086, - "grad_norm": 5.21944522857666, + "grad_norm": 5.699330806732178, "learning_rate": 4.798543689320388e-05, - "loss": 4.7456, + "loss": 4.8009, "step": 250 }, { "epoch": 1.4563106796116505, - "grad_norm": 5.471007347106934, + "grad_norm": 5.711933135986328, "learning_rate": 4.7580906148867315e-05, - "loss": 4.6254, + "loss": 4.607, "step": 300 }, { "epoch": 1.6990291262135924, - "grad_norm": 5.345245838165283, + "grad_norm": 5.113691806793213, "learning_rate": 4.717637540453075e-05, - "loss": 4.5216, + "loss": 4.462, "step": 350 }, { "epoch": 1.941747572815534, - "grad_norm": 5.54531192779541, + "grad_norm": 5.632521152496338, "learning_rate": 4.6771844660194174e-05, - "loss": 4.4353, + "loss": 4.3695, "step": 400 }, { "epoch": 2.1844660194174756, - "grad_norm": 5.244582176208496, + "grad_norm": 5.428906440734863, "learning_rate": 4.636731391585761e-05, - "loss": 4.2444, + "loss": 4.1549, "step": 450 }, { "epoch": 2.4271844660194173, - "grad_norm": 5.338261127471924, + "grad_norm": 5.037013530731201, "learning_rate": 4.596278317152104e-05, - "loss": 4.1915, + "loss": 4.0921, "step": 500 }, { "epoch": 2.6699029126213594, - "grad_norm": 5.35228157043457, + "grad_norm": 5.231846809387207, "learning_rate": 4.555825242718447e-05, - "loss": 4.1612, + "loss": 4.0662, "step": 550 }, { "epoch": 2.912621359223301, - "grad_norm": 5.287161827087402, + "grad_norm": 5.506278038024902, "learning_rate": 4.51537216828479e-05, - "loss": 4.1322, + "loss": 4.0403, "step": 600 }, { "epoch": 3.1553398058252426, - "grad_norm": 5.2932209968566895, + "grad_norm": 5.410216331481934, "learning_rate": 4.4749190938511334e-05, - "loss": 3.9684, + "loss": 3.8375, "step": 650 }, { "epoch": 3.3980582524271843, - "grad_norm": 5.4360833168029785, + "grad_norm": 5.637699127197266, "learning_rate": 4.434466019417476e-05, - "loss": 3.8933, + "loss": 3.7473, "step": 700 }, { "epoch": 3.6407766990291264, - "grad_norm": 5.5595221519470215, + "grad_norm": 5.575273513793945, "learning_rate": 4.3940129449838194e-05, - "loss": 3.9306, + "loss": 3.7835, "step": 750 }, { "epoch": 3.883495145631068, - "grad_norm": 5.540086269378662, + "grad_norm": 5.737198352813721, "learning_rate": 4.353559870550162e-05, - "loss": 3.9317, + "loss": 3.7919, "step": 800 }, { "epoch": 4.12621359223301, - "grad_norm": 5.567137718200684, + "grad_norm": 5.645532608032227, "learning_rate": 4.313106796116505e-05, - "loss": 3.8135, + "loss": 3.6425, "step": 850 }, { "epoch": 4.368932038834951, - "grad_norm": 6.00346040725708, + "grad_norm": 6.308927059173584, "learning_rate": 4.272653721682848e-05, - "loss": 3.6924, + "loss": 3.4785, "step": 900 }, { "epoch": 4.611650485436893, - "grad_norm": 5.635782718658447, + "grad_norm": 5.863094329833984, "learning_rate": 4.232200647249191e-05, - "loss": 3.7304, + "loss": 3.5268, "step": 950 }, { "epoch": 4.854368932038835, - "grad_norm": 5.9545440673828125, + "grad_norm": 6.120258331298828, "learning_rate": 4.191747572815534e-05, - "loss": 3.7471, + "loss": 3.5296, "step": 1000 } ], @@ -167,7 +167,7 @@ "attributes": {} } }, - "total_flos": 130648375296000.0, + "total_flos": 464353492992000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoints/checkpoint-1000/training_args.bin b/checkpoints/checkpoint-1000/training_args.bin index 0403c28e41ead56dc065860ac748f500b2df575a..6dcc5a1a1def25e4256c25ec89c109edb25731fb 100644 --- a/checkpoints/checkpoint-1000/training_args.bin +++ b/checkpoints/checkpoint-1000/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7935e3b7629485cf26f624afe618b58a390d4c3c9c6dcf878b3ba9099ef1027b +oid sha256:d5c04ce53d557cd762ff77607aaadbd1248aaa177697662d9a21b90d444993b7 size 5304 diff --git a/checkpoints/checkpoint-1500/config.json b/checkpoints/checkpoint-1500/config.json index fdd810faba252b1c3f085663999fe9c24e8e44d6..990b1515737ce9bfc8638b4941fcfacdce1334d3 100644 --- a/checkpoints/checkpoint-1500/config.json +++ b/checkpoints/checkpoint-1500/config.json @@ -10,12 +10,12 @@ "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", - "n_ctx": 1024, - "n_embd": 768, + "n_ctx": 2048, + "n_embd": 1024, "n_head": 2, "n_inner": null, - "n_layer": 6, - "n_positions": 1024, + "n_layer": 12, + "n_positions": 2048, "pad_token_id": 1, "reorder_and_upcast_attn": false, "resid_pdrop": 0.1, diff --git a/checkpoints/checkpoint-1500/model.safetensors b/checkpoints/checkpoint-1500/model.safetensors index 3af468bfe215d65337c8fbcd26fdb17e9730be07..17e027e5abbeaf277050b9fb5bb3eb7ab03563ae 100644 --- a/checkpoints/checkpoint-1500/model.safetensors +++ b/checkpoints/checkpoint-1500/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be299c4059db11d4c5f71f16a3deddb85120abd5b479bf8e8320316f7aae7e16 -size 176343496 +oid sha256:3e9d84a36c4519bf72764d9744d791a4693b054535689bf3648c7d0833ae5356 +size 617130824 diff --git a/checkpoints/checkpoint-1500/optimizer.pt b/checkpoints/checkpoint-1500/optimizer.pt index 3aecc84c94c50fb476055e508f60f9c8c7fa5c66..45f9230f96426c5cbbf19d9d1f6be8448f929475 100644 --- a/checkpoints/checkpoint-1500/optimizer.pt +++ b/checkpoints/checkpoint-1500/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:295f43bb7dbf683a742daa0d0fbb0499cfe10fbb60f183a5953d842c37ce5de5 -size 352735610 +oid sha256:e76d49ca329f71b0d0da9d3ae2c4624ac37ed78688ebf52ff5dee93474eb6e7b +size 1234355130 diff --git a/checkpoints/checkpoint-1500/rng_state.pth b/checkpoints/checkpoint-1500/rng_state.pth index 0400f6ed0f24b62cbe8d622bef90933aa3c858df..99f6240ac564aa437ba58d4279afc4471795bbfc 100644 --- a/checkpoints/checkpoint-1500/rng_state.pth +++ b/checkpoints/checkpoint-1500/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73893aa1beb4ff8ac42b526195db283c1ce3c2d78f82fa2ddd5a616c21116d1f +oid sha256:4f95e2cc3df656ac2c5f1f4c905d818b69371ad7deb474b77aec082625d08cce size 14244 diff --git a/checkpoints/checkpoint-1500/trainer_state.json b/checkpoints/checkpoint-1500/trainer_state.json index 7e3c0d8379f0d876f705b9e65d35e1cac1c9b120..7b78aea21894969a4c75a97bc20fa4fc8dd6dcf2 100644 --- a/checkpoints/checkpoint-1500/trainer_state.json +++ b/checkpoints/checkpoint-1500/trainer_state.json @@ -11,212 +11,212 @@ "log_history": [ { "epoch": 0.24271844660194175, - "grad_norm": 2.239194869995117, + "grad_norm": 2.8362834453582764, "learning_rate": 4.960355987055016e-05, - "loss": 6.099, + "loss": 6.0742, "step": 50 }, { "epoch": 0.4854368932038835, - "grad_norm": 3.8653194904327393, + "grad_norm": 2.9484496116638184, "learning_rate": 4.9199029126213595e-05, - "loss": 5.6954, + "loss": 5.7218, "step": 100 }, { "epoch": 0.7281553398058253, - "grad_norm": 4.652886867523193, + "grad_norm": 3.542572259902954, "learning_rate": 4.879449838187702e-05, - "loss": 5.3598, + "loss": 5.4227, "step": 150 }, { "epoch": 0.970873786407767, - "grad_norm": 5.048999309539795, + "grad_norm": 4.634098052978516, "learning_rate": 4.8389967637540455e-05, - "loss": 5.0982, + "loss": 5.1655, "step": 200 }, { "epoch": 1.2135922330097086, - "grad_norm": 5.21944522857666, + "grad_norm": 5.699330806732178, "learning_rate": 4.798543689320388e-05, - "loss": 4.7456, + "loss": 4.8009, "step": 250 }, { "epoch": 1.4563106796116505, - "grad_norm": 5.471007347106934, + "grad_norm": 5.711933135986328, "learning_rate": 4.7580906148867315e-05, - "loss": 4.6254, + "loss": 4.607, "step": 300 }, { "epoch": 1.6990291262135924, - "grad_norm": 5.345245838165283, + "grad_norm": 5.113691806793213, "learning_rate": 4.717637540453075e-05, - "loss": 4.5216, + "loss": 4.462, "step": 350 }, { "epoch": 1.941747572815534, - "grad_norm": 5.54531192779541, + "grad_norm": 5.632521152496338, "learning_rate": 4.6771844660194174e-05, - "loss": 4.4353, + "loss": 4.3695, "step": 400 }, { "epoch": 2.1844660194174756, - "grad_norm": 5.244582176208496, + "grad_norm": 5.428906440734863, "learning_rate": 4.636731391585761e-05, - "loss": 4.2444, + "loss": 4.1549, "step": 450 }, { "epoch": 2.4271844660194173, - "grad_norm": 5.338261127471924, + "grad_norm": 5.037013530731201, "learning_rate": 4.596278317152104e-05, - "loss": 4.1915, + "loss": 4.0921, "step": 500 }, { "epoch": 2.6699029126213594, - "grad_norm": 5.35228157043457, + "grad_norm": 5.231846809387207, "learning_rate": 4.555825242718447e-05, - "loss": 4.1612, + "loss": 4.0662, "step": 550 }, { "epoch": 2.912621359223301, - "grad_norm": 5.287161827087402, + "grad_norm": 5.506278038024902, "learning_rate": 4.51537216828479e-05, - "loss": 4.1322, + "loss": 4.0403, "step": 600 }, { "epoch": 3.1553398058252426, - "grad_norm": 5.2932209968566895, + "grad_norm": 5.410216331481934, "learning_rate": 4.4749190938511334e-05, - "loss": 3.9684, + "loss": 3.8375, "step": 650 }, { "epoch": 3.3980582524271843, - "grad_norm": 5.4360833168029785, + "grad_norm": 5.637699127197266, "learning_rate": 4.434466019417476e-05, - "loss": 3.8933, + "loss": 3.7473, "step": 700 }, { "epoch": 3.6407766990291264, - "grad_norm": 5.5595221519470215, + "grad_norm": 5.575273513793945, "learning_rate": 4.3940129449838194e-05, - "loss": 3.9306, + "loss": 3.7835, "step": 750 }, { "epoch": 3.883495145631068, - "grad_norm": 5.540086269378662, + "grad_norm": 5.737198352813721, "learning_rate": 4.353559870550162e-05, - "loss": 3.9317, + "loss": 3.7919, "step": 800 }, { "epoch": 4.12621359223301, - "grad_norm": 5.567137718200684, + "grad_norm": 5.645532608032227, "learning_rate": 4.313106796116505e-05, - "loss": 3.8135, + "loss": 3.6425, "step": 850 }, { "epoch": 4.368932038834951, - "grad_norm": 6.00346040725708, + "grad_norm": 6.308927059173584, "learning_rate": 4.272653721682848e-05, - "loss": 3.6924, + "loss": 3.4785, "step": 900 }, { "epoch": 4.611650485436893, - "grad_norm": 5.635782718658447, + "grad_norm": 5.863094329833984, "learning_rate": 4.232200647249191e-05, - "loss": 3.7304, + "loss": 3.5268, "step": 950 }, { "epoch": 4.854368932038835, - "grad_norm": 5.9545440673828125, + "grad_norm": 6.120258331298828, "learning_rate": 4.191747572815534e-05, - "loss": 3.7471, + "loss": 3.5296, "step": 1000 }, { "epoch": 5.097087378640777, - "grad_norm": 5.696166038513184, + "grad_norm": 6.231515884399414, "learning_rate": 4.1512944983818774e-05, - "loss": 3.6287, + "loss": 3.3802, "step": 1050 }, { "epoch": 5.339805825242719, - "grad_norm": 5.844620704650879, + "grad_norm": 6.211188316345215, "learning_rate": 4.11084142394822e-05, - "loss": 3.5299, + "loss": 3.2148, "step": 1100 }, { "epoch": 5.58252427184466, - "grad_norm": 6.373577117919922, + "grad_norm": 6.8676934242248535, "learning_rate": 4.0703883495145634e-05, - "loss": 3.5285, + "loss": 3.2111, "step": 1150 }, { "epoch": 5.825242718446602, - "grad_norm": 6.488651275634766, + "grad_norm": 6.851133346557617, "learning_rate": 4.029935275080906e-05, - "loss": 3.5789, + "loss": 3.2771, "step": 1200 }, { "epoch": 6.067961165048544, - "grad_norm": 6.170980930328369, + "grad_norm": 6.708765506744385, "learning_rate": 3.9894822006472494e-05, - "loss": 3.494, + "loss": 3.1587, "step": 1250 }, { "epoch": 6.310679611650485, - "grad_norm": 6.387782573699951, + "grad_norm": 7.136553764343262, "learning_rate": 3.949029126213593e-05, - "loss": 3.353, + "loss": 2.9192, "step": 1300 }, { "epoch": 6.553398058252427, - "grad_norm": 6.718175888061523, + "grad_norm": 7.370823383331299, "learning_rate": 3.9085760517799354e-05, - "loss": 3.3521, + "loss": 2.922, "step": 1350 }, { "epoch": 6.796116504854369, - "grad_norm": 6.7870025634765625, + "grad_norm": 7.630361557006836, "learning_rate": 3.868122977346279e-05, - "loss": 3.3658, + "loss": 2.9417, "step": 1400 }, { "epoch": 7.038834951456311, - "grad_norm": 6.513721466064453, + "grad_norm": 7.765511989593506, "learning_rate": 3.827669902912622e-05, - "loss": 3.3301, + "loss": 2.8669, "step": 1450 }, { "epoch": 7.281553398058253, - "grad_norm": 7.0680036544799805, + "grad_norm": 7.85439920425415, "learning_rate": 3.787216828478965e-05, - "loss": 3.1289, + "loss": 2.5412, "step": 1500 } ], @@ -237,7 +237,7 @@ "attributes": {} } }, - "total_flos": 195972562944000.0, + "total_flos": 696530239488000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoints/checkpoint-1500/training_args.bin b/checkpoints/checkpoint-1500/training_args.bin index 0403c28e41ead56dc065860ac748f500b2df575a..6dcc5a1a1def25e4256c25ec89c109edb25731fb 100644 --- a/checkpoints/checkpoint-1500/training_args.bin +++ b/checkpoints/checkpoint-1500/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7935e3b7629485cf26f624afe618b58a390d4c3c9c6dcf878b3ba9099ef1027b +oid sha256:d5c04ce53d557cd762ff77607aaadbd1248aaa177697662d9a21b90d444993b7 size 5304 diff --git a/checkpoints/checkpoint-2000/config.json b/checkpoints/checkpoint-2000/config.json index fdd810faba252b1c3f085663999fe9c24e8e44d6..990b1515737ce9bfc8638b4941fcfacdce1334d3 100644 --- a/checkpoints/checkpoint-2000/config.json +++ b/checkpoints/checkpoint-2000/config.json @@ -10,12 +10,12 @@ "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", - "n_ctx": 1024, - "n_embd": 768, + "n_ctx": 2048, + "n_embd": 1024, "n_head": 2, "n_inner": null, - "n_layer": 6, - "n_positions": 1024, + "n_layer": 12, + "n_positions": 2048, "pad_token_id": 1, "reorder_and_upcast_attn": false, "resid_pdrop": 0.1, diff --git a/checkpoints/checkpoint-2000/model.safetensors b/checkpoints/checkpoint-2000/model.safetensors index fc8674aefc67b59b0bc080e293d502ecb5b8b78e..d77939930ea6b910d2673a7c80d6eb6bf8080fcd 100644 --- a/checkpoints/checkpoint-2000/model.safetensors +++ b/checkpoints/checkpoint-2000/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31f153b730bb5ccd728c40ab63c17b596f9e9ae82c15275165a9eeb69cc79bf0 -size 176343496 +oid sha256:3139c52e149dceb7821da5478bb894919e6cc2a81a11d980add682dc884a5b76 +size 617130824 diff --git a/checkpoints/checkpoint-2000/optimizer.pt b/checkpoints/checkpoint-2000/optimizer.pt index 2d923c6bc09d9c71f4cc2c99f3465aa012c7ad04..21a1049cecb4459a6c8c40b178c20c6893fcff96 100644 --- a/checkpoints/checkpoint-2000/optimizer.pt +++ b/checkpoints/checkpoint-2000/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d2690494d465e208982d5145f8448519f3fe6f6f92a6f6306bf06891d5d5b7c -size 352735610 +oid sha256:a5931b4e56f412f06449e4f288e088bb30ed3fc4bf706aa5894a87df734d5d19 +size 1234355130 diff --git a/checkpoints/checkpoint-2000/rng_state.pth b/checkpoints/checkpoint-2000/rng_state.pth index dd8ddb39fedc80888f1dce2a1c5c3de475da0cdd..80fd5023087c2623223e30fb8292dc9f123191d0 100644 --- a/checkpoints/checkpoint-2000/rng_state.pth +++ b/checkpoints/checkpoint-2000/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa303010c68e2e269cd4bbfa77fb0543382a871099f691fddfc6344963ce0793 +oid sha256:ace39524c0ea246b461dfdc96e06376945ad766256fe88d13b43d6b4c7c3988e size 14244 diff --git a/checkpoints/checkpoint-2000/trainer_state.json b/checkpoints/checkpoint-2000/trainer_state.json index c45fc6dfe4bf342949609a58506e61e38416d6ea..f52571d7ca80355228877a6d9edd4436f3fc10d9 100644 --- a/checkpoints/checkpoint-2000/trainer_state.json +++ b/checkpoints/checkpoint-2000/trainer_state.json @@ -11,282 +11,282 @@ "log_history": [ { "epoch": 0.24271844660194175, - "grad_norm": 2.239194869995117, + "grad_norm": 2.8362834453582764, "learning_rate": 4.960355987055016e-05, - "loss": 6.099, + "loss": 6.0742, "step": 50 }, { "epoch": 0.4854368932038835, - "grad_norm": 3.8653194904327393, + "grad_norm": 2.9484496116638184, "learning_rate": 4.9199029126213595e-05, - "loss": 5.6954, + "loss": 5.7218, "step": 100 }, { "epoch": 0.7281553398058253, - "grad_norm": 4.652886867523193, + "grad_norm": 3.542572259902954, "learning_rate": 4.879449838187702e-05, - "loss": 5.3598, + "loss": 5.4227, "step": 150 }, { "epoch": 0.970873786407767, - "grad_norm": 5.048999309539795, + "grad_norm": 4.634098052978516, "learning_rate": 4.8389967637540455e-05, - "loss": 5.0982, + "loss": 5.1655, "step": 200 }, { "epoch": 1.2135922330097086, - "grad_norm": 5.21944522857666, + "grad_norm": 5.699330806732178, "learning_rate": 4.798543689320388e-05, - "loss": 4.7456, + "loss": 4.8009, "step": 250 }, { "epoch": 1.4563106796116505, - "grad_norm": 5.471007347106934, + "grad_norm": 5.711933135986328, "learning_rate": 4.7580906148867315e-05, - "loss": 4.6254, + "loss": 4.607, "step": 300 }, { "epoch": 1.6990291262135924, - "grad_norm": 5.345245838165283, + "grad_norm": 5.113691806793213, "learning_rate": 4.717637540453075e-05, - "loss": 4.5216, + "loss": 4.462, "step": 350 }, { "epoch": 1.941747572815534, - "grad_norm": 5.54531192779541, + "grad_norm": 5.632521152496338, "learning_rate": 4.6771844660194174e-05, - "loss": 4.4353, + "loss": 4.3695, "step": 400 }, { "epoch": 2.1844660194174756, - "grad_norm": 5.244582176208496, + "grad_norm": 5.428906440734863, "learning_rate": 4.636731391585761e-05, - "loss": 4.2444, + "loss": 4.1549, "step": 450 }, { "epoch": 2.4271844660194173, - "grad_norm": 5.338261127471924, + "grad_norm": 5.037013530731201, "learning_rate": 4.596278317152104e-05, - "loss": 4.1915, + "loss": 4.0921, "step": 500 }, { "epoch": 2.6699029126213594, - "grad_norm": 5.35228157043457, + "grad_norm": 5.231846809387207, "learning_rate": 4.555825242718447e-05, - "loss": 4.1612, + "loss": 4.0662, "step": 550 }, { "epoch": 2.912621359223301, - "grad_norm": 5.287161827087402, + "grad_norm": 5.506278038024902, "learning_rate": 4.51537216828479e-05, - "loss": 4.1322, + "loss": 4.0403, "step": 600 }, { "epoch": 3.1553398058252426, - "grad_norm": 5.2932209968566895, + "grad_norm": 5.410216331481934, "learning_rate": 4.4749190938511334e-05, - "loss": 3.9684, + "loss": 3.8375, "step": 650 }, { "epoch": 3.3980582524271843, - "grad_norm": 5.4360833168029785, + "grad_norm": 5.637699127197266, "learning_rate": 4.434466019417476e-05, - "loss": 3.8933, + "loss": 3.7473, "step": 700 }, { "epoch": 3.6407766990291264, - "grad_norm": 5.5595221519470215, + "grad_norm": 5.575273513793945, "learning_rate": 4.3940129449838194e-05, - "loss": 3.9306, + "loss": 3.7835, "step": 750 }, { "epoch": 3.883495145631068, - "grad_norm": 5.540086269378662, + "grad_norm": 5.737198352813721, "learning_rate": 4.353559870550162e-05, - "loss": 3.9317, + "loss": 3.7919, "step": 800 }, { "epoch": 4.12621359223301, - "grad_norm": 5.567137718200684, + "grad_norm": 5.645532608032227, "learning_rate": 4.313106796116505e-05, - "loss": 3.8135, + "loss": 3.6425, "step": 850 }, { "epoch": 4.368932038834951, - "grad_norm": 6.00346040725708, + "grad_norm": 6.308927059173584, "learning_rate": 4.272653721682848e-05, - "loss": 3.6924, + "loss": 3.4785, "step": 900 }, { "epoch": 4.611650485436893, - "grad_norm": 5.635782718658447, + "grad_norm": 5.863094329833984, "learning_rate": 4.232200647249191e-05, - "loss": 3.7304, + "loss": 3.5268, "step": 950 }, { "epoch": 4.854368932038835, - "grad_norm": 5.9545440673828125, + "grad_norm": 6.120258331298828, "learning_rate": 4.191747572815534e-05, - "loss": 3.7471, + "loss": 3.5296, "step": 1000 }, { "epoch": 5.097087378640777, - "grad_norm": 5.696166038513184, + "grad_norm": 6.231515884399414, "learning_rate": 4.1512944983818774e-05, - "loss": 3.6287, + "loss": 3.3802, "step": 1050 }, { "epoch": 5.339805825242719, - "grad_norm": 5.844620704650879, + "grad_norm": 6.211188316345215, "learning_rate": 4.11084142394822e-05, - "loss": 3.5299, + "loss": 3.2148, "step": 1100 }, { "epoch": 5.58252427184466, - "grad_norm": 6.373577117919922, + "grad_norm": 6.8676934242248535, "learning_rate": 4.0703883495145634e-05, - "loss": 3.5285, + "loss": 3.2111, "step": 1150 }, { "epoch": 5.825242718446602, - "grad_norm": 6.488651275634766, + "grad_norm": 6.851133346557617, "learning_rate": 4.029935275080906e-05, - "loss": 3.5789, + "loss": 3.2771, "step": 1200 }, { "epoch": 6.067961165048544, - "grad_norm": 6.170980930328369, + "grad_norm": 6.708765506744385, "learning_rate": 3.9894822006472494e-05, - "loss": 3.494, + "loss": 3.1587, "step": 1250 }, { "epoch": 6.310679611650485, - "grad_norm": 6.387782573699951, + "grad_norm": 7.136553764343262, "learning_rate": 3.949029126213593e-05, - "loss": 3.353, + "loss": 2.9192, "step": 1300 }, { "epoch": 6.553398058252427, - "grad_norm": 6.718175888061523, + "grad_norm": 7.370823383331299, "learning_rate": 3.9085760517799354e-05, - "loss": 3.3521, + "loss": 2.922, "step": 1350 }, { "epoch": 6.796116504854369, - "grad_norm": 6.7870025634765625, + "grad_norm": 7.630361557006836, "learning_rate": 3.868122977346279e-05, - "loss": 3.3658, + "loss": 2.9417, "step": 1400 }, { "epoch": 7.038834951456311, - "grad_norm": 6.513721466064453, + "grad_norm": 7.765511989593506, "learning_rate": 3.827669902912622e-05, - "loss": 3.3301, + "loss": 2.8669, "step": 1450 }, { "epoch": 7.281553398058253, - "grad_norm": 7.0680036544799805, + "grad_norm": 7.85439920425415, "learning_rate": 3.787216828478965e-05, - "loss": 3.1289, + "loss": 2.5412, "step": 1500 }, { "epoch": 7.524271844660194, - "grad_norm": 7.019896507263184, + "grad_norm": 8.064071655273438, "learning_rate": 3.746763754045307e-05, - "loss": 3.1878, + "loss": 2.6131, "step": 1550 }, { "epoch": 7.766990291262136, - "grad_norm": 7.3232197761535645, + "grad_norm": 8.692522048950195, "learning_rate": 3.7063106796116507e-05, - "loss": 3.2006, + "loss": 2.6356, "step": 1600 }, { "epoch": 8.009708737864077, - "grad_norm": 7.003912925720215, + "grad_norm": 8.012410163879395, "learning_rate": 3.665857605177993e-05, - "loss": 3.1737, + "loss": 2.599, "step": 1650 }, { "epoch": 8.25242718446602, - "grad_norm": 7.592433929443359, + "grad_norm": 8.6799898147583, "learning_rate": 3.6254045307443366e-05, - "loss": 2.9258, + "loss": 2.1992, "step": 1700 }, { "epoch": 8.495145631067961, - "grad_norm": 7.629278182983398, + "grad_norm": 8.963507652282715, "learning_rate": 3.584951456310679e-05, - "loss": 2.9999, + "loss": 2.2701, "step": 1750 }, { "epoch": 8.737864077669903, - "grad_norm": 8.137636184692383, + "grad_norm": 9.833822250366211, "learning_rate": 3.5444983818770226e-05, - "loss": 3.0155, + "loss": 2.2837, "step": 1800 }, { "epoch": 8.980582524271846, - "grad_norm": 7.967227935791016, + "grad_norm": 9.47242259979248, "learning_rate": 3.504045307443366e-05, - "loss": 2.9945, + "loss": 2.2854, "step": 1850 }, { "epoch": 9.223300970873787, - "grad_norm": 7.9910078048706055, + "grad_norm": 8.856402397155762, "learning_rate": 3.4635922330097086e-05, - "loss": 2.7456, + "loss": 1.8862, "step": 1900 }, { "epoch": 9.466019417475728, - "grad_norm": 8.552720069885254, + "grad_norm": 9.948341369628906, "learning_rate": 3.423139158576052e-05, - "loss": 2.8071, + "loss": 1.922, "step": 1950 }, { "epoch": 9.70873786407767, - "grad_norm": 8.336930274963379, + "grad_norm": 10.28939151763916, "learning_rate": 3.382686084142395e-05, - "loss": 2.8117, + "loss": 1.9622, "step": 2000 } ], @@ -307,7 +307,7 @@ "attributes": {} } }, - "total_flos": 261296750592000.0, + "total_flos": 928706985984000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoints/checkpoint-2000/training_args.bin b/checkpoints/checkpoint-2000/training_args.bin index 0403c28e41ead56dc065860ac748f500b2df575a..6dcc5a1a1def25e4256c25ec89c109edb25731fb 100644 --- a/checkpoints/checkpoint-2000/training_args.bin +++ b/checkpoints/checkpoint-2000/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7935e3b7629485cf26f624afe618b58a390d4c3c9c6dcf878b3ba9099ef1027b +oid sha256:d5c04ce53d557cd762ff77607aaadbd1248aaa177697662d9a21b90d444993b7 size 5304 diff --git a/checkpoints/checkpoint-2500/config.json b/checkpoints/checkpoint-2500/config.json index fdd810faba252b1c3f085663999fe9c24e8e44d6..990b1515737ce9bfc8638b4941fcfacdce1334d3 100644 --- a/checkpoints/checkpoint-2500/config.json +++ b/checkpoints/checkpoint-2500/config.json @@ -10,12 +10,12 @@ "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", - "n_ctx": 1024, - "n_embd": 768, + "n_ctx": 2048, + "n_embd": 1024, "n_head": 2, "n_inner": null, - "n_layer": 6, - "n_positions": 1024, + "n_layer": 12, + "n_positions": 2048, "pad_token_id": 1, "reorder_and_upcast_attn": false, "resid_pdrop": 0.1, diff --git a/checkpoints/checkpoint-2500/model.safetensors b/checkpoints/checkpoint-2500/model.safetensors index f71077b9ce80674150a8528975521855bc0acdc1..eeb22f340ce29bc12f5096a3b60eef2ae7761ab0 100644 --- a/checkpoints/checkpoint-2500/model.safetensors +++ b/checkpoints/checkpoint-2500/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1cf1426568d31687412e315e1e4bfca12c7d5f958e2e1e9165ffa0db03a7554b -size 176343496 +oid sha256:2a16b607fac318ea7e4614190d1d857c6fc3ed432f70935eac08139817139d27 +size 617130824 diff --git a/checkpoints/checkpoint-2500/optimizer.pt b/checkpoints/checkpoint-2500/optimizer.pt index fd9070458b5ac7fdbf3e47ce322a3ca3ace0f8d2..03cbf86dcb34d5e476a21e34f3aa3e678ac4bffe 100644 --- a/checkpoints/checkpoint-2500/optimizer.pt +++ b/checkpoints/checkpoint-2500/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d131c0a3b1896514cda171b1698f8ad491e48fe541af53ae5dc2ed7bff06444 -size 352735610 +oid sha256:1df103a8189689ad21192707d0a5df531adec3994d42b7d06db9ffe8efac9c0b +size 1234355130 diff --git a/checkpoints/checkpoint-2500/rng_state.pth b/checkpoints/checkpoint-2500/rng_state.pth index b737ce28d4592687d6c60bfdae5f17bc2d76764b..60f5c41d3bbb106b17bd85651d647153b83ed93b 100644 --- a/checkpoints/checkpoint-2500/rng_state.pth +++ b/checkpoints/checkpoint-2500/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a57a7d4589802cdb66d0be994455c3851136fb7cbe9cea2866d0b58af11c743 +oid sha256:6220bdac54f7d614e1573ac8dd4440b0a8ffb03b06787f2282bd064aa9d0431e size 14244 diff --git a/checkpoints/checkpoint-2500/trainer_state.json b/checkpoints/checkpoint-2500/trainer_state.json index 19b8e2823f88d397b526052db1b3fe28bdaef548..375b76d90fc025cfeb1202bff0032639cb5ea986 100644 --- a/checkpoints/checkpoint-2500/trainer_state.json +++ b/checkpoints/checkpoint-2500/trainer_state.json @@ -11,352 +11,352 @@ "log_history": [ { "epoch": 0.24271844660194175, - "grad_norm": 2.239194869995117, + "grad_norm": 2.8362834453582764, "learning_rate": 4.960355987055016e-05, - "loss": 6.099, + "loss": 6.0742, "step": 50 }, { "epoch": 0.4854368932038835, - "grad_norm": 3.8653194904327393, + "grad_norm": 2.9484496116638184, "learning_rate": 4.9199029126213595e-05, - "loss": 5.6954, + "loss": 5.7218, "step": 100 }, { "epoch": 0.7281553398058253, - "grad_norm": 4.652886867523193, + "grad_norm": 3.542572259902954, "learning_rate": 4.879449838187702e-05, - "loss": 5.3598, + "loss": 5.4227, "step": 150 }, { "epoch": 0.970873786407767, - "grad_norm": 5.048999309539795, + "grad_norm": 4.634098052978516, "learning_rate": 4.8389967637540455e-05, - "loss": 5.0982, + "loss": 5.1655, "step": 200 }, { "epoch": 1.2135922330097086, - "grad_norm": 5.21944522857666, + "grad_norm": 5.699330806732178, "learning_rate": 4.798543689320388e-05, - "loss": 4.7456, + "loss": 4.8009, "step": 250 }, { "epoch": 1.4563106796116505, - "grad_norm": 5.471007347106934, + "grad_norm": 5.711933135986328, "learning_rate": 4.7580906148867315e-05, - "loss": 4.6254, + "loss": 4.607, "step": 300 }, { "epoch": 1.6990291262135924, - "grad_norm": 5.345245838165283, + "grad_norm": 5.113691806793213, "learning_rate": 4.717637540453075e-05, - "loss": 4.5216, + "loss": 4.462, "step": 350 }, { "epoch": 1.941747572815534, - "grad_norm": 5.54531192779541, + "grad_norm": 5.632521152496338, "learning_rate": 4.6771844660194174e-05, - "loss": 4.4353, + "loss": 4.3695, "step": 400 }, { "epoch": 2.1844660194174756, - "grad_norm": 5.244582176208496, + "grad_norm": 5.428906440734863, "learning_rate": 4.636731391585761e-05, - "loss": 4.2444, + "loss": 4.1549, "step": 450 }, { "epoch": 2.4271844660194173, - "grad_norm": 5.338261127471924, + "grad_norm": 5.037013530731201, "learning_rate": 4.596278317152104e-05, - "loss": 4.1915, + "loss": 4.0921, "step": 500 }, { "epoch": 2.6699029126213594, - "grad_norm": 5.35228157043457, + "grad_norm": 5.231846809387207, "learning_rate": 4.555825242718447e-05, - "loss": 4.1612, + "loss": 4.0662, "step": 550 }, { "epoch": 2.912621359223301, - "grad_norm": 5.287161827087402, + "grad_norm": 5.506278038024902, "learning_rate": 4.51537216828479e-05, - "loss": 4.1322, + "loss": 4.0403, "step": 600 }, { "epoch": 3.1553398058252426, - "grad_norm": 5.2932209968566895, + "grad_norm": 5.410216331481934, "learning_rate": 4.4749190938511334e-05, - "loss": 3.9684, + "loss": 3.8375, "step": 650 }, { "epoch": 3.3980582524271843, - "grad_norm": 5.4360833168029785, + "grad_norm": 5.637699127197266, "learning_rate": 4.434466019417476e-05, - "loss": 3.8933, + "loss": 3.7473, "step": 700 }, { "epoch": 3.6407766990291264, - "grad_norm": 5.5595221519470215, + "grad_norm": 5.575273513793945, "learning_rate": 4.3940129449838194e-05, - "loss": 3.9306, + "loss": 3.7835, "step": 750 }, { "epoch": 3.883495145631068, - "grad_norm": 5.540086269378662, + "grad_norm": 5.737198352813721, "learning_rate": 4.353559870550162e-05, - "loss": 3.9317, + "loss": 3.7919, "step": 800 }, { "epoch": 4.12621359223301, - "grad_norm": 5.567137718200684, + "grad_norm": 5.645532608032227, "learning_rate": 4.313106796116505e-05, - "loss": 3.8135, + "loss": 3.6425, "step": 850 }, { "epoch": 4.368932038834951, - "grad_norm": 6.00346040725708, + "grad_norm": 6.308927059173584, "learning_rate": 4.272653721682848e-05, - "loss": 3.6924, + "loss": 3.4785, "step": 900 }, { "epoch": 4.611650485436893, - "grad_norm": 5.635782718658447, + "grad_norm": 5.863094329833984, "learning_rate": 4.232200647249191e-05, - "loss": 3.7304, + "loss": 3.5268, "step": 950 }, { "epoch": 4.854368932038835, - "grad_norm": 5.9545440673828125, + "grad_norm": 6.120258331298828, "learning_rate": 4.191747572815534e-05, - "loss": 3.7471, + "loss": 3.5296, "step": 1000 }, { "epoch": 5.097087378640777, - "grad_norm": 5.696166038513184, + "grad_norm": 6.231515884399414, "learning_rate": 4.1512944983818774e-05, - "loss": 3.6287, + "loss": 3.3802, "step": 1050 }, { "epoch": 5.339805825242719, - "grad_norm": 5.844620704650879, + "grad_norm": 6.211188316345215, "learning_rate": 4.11084142394822e-05, - "loss": 3.5299, + "loss": 3.2148, "step": 1100 }, { "epoch": 5.58252427184466, - "grad_norm": 6.373577117919922, + "grad_norm": 6.8676934242248535, "learning_rate": 4.0703883495145634e-05, - "loss": 3.5285, + "loss": 3.2111, "step": 1150 }, { "epoch": 5.825242718446602, - "grad_norm": 6.488651275634766, + "grad_norm": 6.851133346557617, "learning_rate": 4.029935275080906e-05, - "loss": 3.5789, + "loss": 3.2771, "step": 1200 }, { "epoch": 6.067961165048544, - "grad_norm": 6.170980930328369, + "grad_norm": 6.708765506744385, "learning_rate": 3.9894822006472494e-05, - "loss": 3.494, + "loss": 3.1587, "step": 1250 }, { "epoch": 6.310679611650485, - "grad_norm": 6.387782573699951, + "grad_norm": 7.136553764343262, "learning_rate": 3.949029126213593e-05, - "loss": 3.353, + "loss": 2.9192, "step": 1300 }, { "epoch": 6.553398058252427, - "grad_norm": 6.718175888061523, + "grad_norm": 7.370823383331299, "learning_rate": 3.9085760517799354e-05, - "loss": 3.3521, + "loss": 2.922, "step": 1350 }, { "epoch": 6.796116504854369, - "grad_norm": 6.7870025634765625, + "grad_norm": 7.630361557006836, "learning_rate": 3.868122977346279e-05, - "loss": 3.3658, + "loss": 2.9417, "step": 1400 }, { "epoch": 7.038834951456311, - "grad_norm": 6.513721466064453, + "grad_norm": 7.765511989593506, "learning_rate": 3.827669902912622e-05, - "loss": 3.3301, + "loss": 2.8669, "step": 1450 }, { "epoch": 7.281553398058253, - "grad_norm": 7.0680036544799805, + "grad_norm": 7.85439920425415, "learning_rate": 3.787216828478965e-05, - "loss": 3.1289, + "loss": 2.5412, "step": 1500 }, { "epoch": 7.524271844660194, - "grad_norm": 7.019896507263184, + "grad_norm": 8.064071655273438, "learning_rate": 3.746763754045307e-05, - "loss": 3.1878, + "loss": 2.6131, "step": 1550 }, { "epoch": 7.766990291262136, - "grad_norm": 7.3232197761535645, + "grad_norm": 8.692522048950195, "learning_rate": 3.7063106796116507e-05, - "loss": 3.2006, + "loss": 2.6356, "step": 1600 }, { "epoch": 8.009708737864077, - "grad_norm": 7.003912925720215, + "grad_norm": 8.012410163879395, "learning_rate": 3.665857605177993e-05, - "loss": 3.1737, + "loss": 2.599, "step": 1650 }, { "epoch": 8.25242718446602, - "grad_norm": 7.592433929443359, + "grad_norm": 8.6799898147583, "learning_rate": 3.6254045307443366e-05, - "loss": 2.9258, + "loss": 2.1992, "step": 1700 }, { "epoch": 8.495145631067961, - "grad_norm": 7.629278182983398, + "grad_norm": 8.963507652282715, "learning_rate": 3.584951456310679e-05, - "loss": 2.9999, + "loss": 2.2701, "step": 1750 }, { "epoch": 8.737864077669903, - "grad_norm": 8.137636184692383, + "grad_norm": 9.833822250366211, "learning_rate": 3.5444983818770226e-05, - "loss": 3.0155, + "loss": 2.2837, "step": 1800 }, { "epoch": 8.980582524271846, - "grad_norm": 7.967227935791016, + "grad_norm": 9.47242259979248, "learning_rate": 3.504045307443366e-05, - "loss": 2.9945, + "loss": 2.2854, "step": 1850 }, { "epoch": 9.223300970873787, - "grad_norm": 7.9910078048706055, + "grad_norm": 8.856402397155762, "learning_rate": 3.4635922330097086e-05, - "loss": 2.7456, + "loss": 1.8862, "step": 1900 }, { "epoch": 9.466019417475728, - "grad_norm": 8.552720069885254, + "grad_norm": 9.948341369628906, "learning_rate": 3.423139158576052e-05, - "loss": 2.8071, + "loss": 1.922, "step": 1950 }, { "epoch": 9.70873786407767, - "grad_norm": 8.336930274963379, + "grad_norm": 10.28939151763916, "learning_rate": 3.382686084142395e-05, - "loss": 2.8117, + "loss": 1.9622, "step": 2000 }, { "epoch": 9.951456310679612, - "grad_norm": 8.636338233947754, + "grad_norm": 10.114766120910645, "learning_rate": 3.342233009708738e-05, - "loss": 2.8197, + "loss": 1.9741, "step": 2050 }, { "epoch": 10.194174757281553, - "grad_norm": 7.955165863037109, + "grad_norm": 9.489270210266113, "learning_rate": 3.301779935275081e-05, - "loss": 2.6245, + "loss": 1.6509, "step": 2100 }, { "epoch": 10.436893203883495, - "grad_norm": 8.679149627685547, + "grad_norm": 10.16482162475586, "learning_rate": 3.2613268608414246e-05, - "loss": 2.565, + "loss": 1.5883, "step": 2150 }, { "epoch": 10.679611650485437, - "grad_norm": 9.150839805603027, + "grad_norm": 11.778059005737305, "learning_rate": 3.220873786407767e-05, - "loss": 2.6258, + "loss": 1.6234, "step": 2200 }, { "epoch": 10.922330097087379, - "grad_norm": 9.2735013961792, + "grad_norm": 10.798454284667969, "learning_rate": 3.1804207119741106e-05, - "loss": 2.6439, + "loss": 1.667, "step": 2250 }, { "epoch": 11.16504854368932, - "grad_norm": 8.620939254760742, + "grad_norm": 9.751585006713867, "learning_rate": 3.139967637540453e-05, - "loss": 2.4371, + "loss": 1.3785, "step": 2300 }, { "epoch": 11.407766990291263, - "grad_norm": 9.722040176391602, + "grad_norm": 10.266364097595215, "learning_rate": 3.099514563106796e-05, - "loss": 2.4074, + "loss": 1.2956, "step": 2350 }, { "epoch": 11.650485436893204, - "grad_norm": 9.200730323791504, + "grad_norm": 10.660856246948242, "learning_rate": 3.059061488673139e-05, - "loss": 2.4307, + "loss": 1.3321, "step": 2400 }, { "epoch": 11.893203883495145, - "grad_norm": 9.692273139953613, + "grad_norm": 10.896187782287598, "learning_rate": 3.0186084142394822e-05, - "loss": 2.424, + "loss": 1.3533, "step": 2450 }, { "epoch": 12.135922330097088, - "grad_norm": 9.822249412536621, + "grad_norm": 10.952502250671387, "learning_rate": 2.9781553398058252e-05, - "loss": 2.2862, + "loss": 1.1486, "step": 2500 } ], @@ -377,7 +377,7 @@ "attributes": {} } }, - "total_flos": 326620938240000.0, + "total_flos": 1160883732480000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoints/checkpoint-2500/training_args.bin b/checkpoints/checkpoint-2500/training_args.bin index 0403c28e41ead56dc065860ac748f500b2df575a..6dcc5a1a1def25e4256c25ec89c109edb25731fb 100644 --- a/checkpoints/checkpoint-2500/training_args.bin +++ b/checkpoints/checkpoint-2500/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7935e3b7629485cf26f624afe618b58a390d4c3c9c6dcf878b3ba9099ef1027b +oid sha256:d5c04ce53d557cd762ff77607aaadbd1248aaa177697662d9a21b90d444993b7 size 5304 diff --git a/checkpoints/checkpoint-3000/config.json b/checkpoints/checkpoint-3000/config.json index fdd810faba252b1c3f085663999fe9c24e8e44d6..990b1515737ce9bfc8638b4941fcfacdce1334d3 100644 --- a/checkpoints/checkpoint-3000/config.json +++ b/checkpoints/checkpoint-3000/config.json @@ -10,12 +10,12 @@ "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", - "n_ctx": 1024, - "n_embd": 768, + "n_ctx": 2048, + "n_embd": 1024, "n_head": 2, "n_inner": null, - "n_layer": 6, - "n_positions": 1024, + "n_layer": 12, + "n_positions": 2048, "pad_token_id": 1, "reorder_and_upcast_attn": false, "resid_pdrop": 0.1, diff --git a/checkpoints/checkpoint-3000/model.safetensors b/checkpoints/checkpoint-3000/model.safetensors index eeaa2e7ac692332e4cd7eeee9ad5582b2c973792..609bf556c48d83c5b91585c409ac2020e9a9517f 100644 --- a/checkpoints/checkpoint-3000/model.safetensors +++ b/checkpoints/checkpoint-3000/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee9950fb5c86bae073ca52771961c408c692f4811dffc7d19b51bc4ce3680cc7 -size 176343496 +oid sha256:dca0c20fbe7c227e25d34c3ae029f189aba9c47fb81a6c624663bff3d5ea0555 +size 617130824 diff --git a/checkpoints/checkpoint-3000/optimizer.pt b/checkpoints/checkpoint-3000/optimizer.pt index c626ed3e6d314185bb74d08789cf5542804b35a4..0e174b910bda0eb607d4ae7c2519063b787de510 100644 --- a/checkpoints/checkpoint-3000/optimizer.pt +++ b/checkpoints/checkpoint-3000/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e038dba515e8f39fe285658df720ade045fb28c8083d1b685733be9fc873618d -size 352735610 +oid sha256:ac34bfb31dd330051c1b732092c16075d893324bb0adbf25c9943f622855be0d +size 1234355130 diff --git a/checkpoints/checkpoint-3000/rng_state.pth b/checkpoints/checkpoint-3000/rng_state.pth index 49c4552c0300171d5a5e73c76dc17684de3a549d..421f37d3fb25459d21862a3786accbdcbb9ec722 100644 --- a/checkpoints/checkpoint-3000/rng_state.pth +++ b/checkpoints/checkpoint-3000/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c44c900bffc69f1c191b57d8140bb20f6c6d64c300841c9c07c540a470f52495 +oid sha256:871442522375fc4d6ba225860b83f783120e0bc1b3bc165d21ae61e30b77d444 size 14244 diff --git a/checkpoints/checkpoint-3000/trainer_state.json b/checkpoints/checkpoint-3000/trainer_state.json index 30d3d92868330cd843852b62ca7acc6595e42d9f..bac6fc3e433ad201300de6e379bcfc10cf31941c 100644 --- a/checkpoints/checkpoint-3000/trainer_state.json +++ b/checkpoints/checkpoint-3000/trainer_state.json @@ -11,422 +11,422 @@ "log_history": [ { "epoch": 0.24271844660194175, - "grad_norm": 2.239194869995117, + "grad_norm": 2.8362834453582764, "learning_rate": 4.960355987055016e-05, - "loss": 6.099, + "loss": 6.0742, "step": 50 }, { "epoch": 0.4854368932038835, - "grad_norm": 3.8653194904327393, + "grad_norm": 2.9484496116638184, "learning_rate": 4.9199029126213595e-05, - "loss": 5.6954, + "loss": 5.7218, "step": 100 }, { "epoch": 0.7281553398058253, - "grad_norm": 4.652886867523193, + "grad_norm": 3.542572259902954, "learning_rate": 4.879449838187702e-05, - "loss": 5.3598, + "loss": 5.4227, "step": 150 }, { "epoch": 0.970873786407767, - "grad_norm": 5.048999309539795, + "grad_norm": 4.634098052978516, "learning_rate": 4.8389967637540455e-05, - "loss": 5.0982, + "loss": 5.1655, "step": 200 }, { "epoch": 1.2135922330097086, - "grad_norm": 5.21944522857666, + "grad_norm": 5.699330806732178, "learning_rate": 4.798543689320388e-05, - "loss": 4.7456, + "loss": 4.8009, "step": 250 }, { "epoch": 1.4563106796116505, - "grad_norm": 5.471007347106934, + "grad_norm": 5.711933135986328, "learning_rate": 4.7580906148867315e-05, - "loss": 4.6254, + "loss": 4.607, "step": 300 }, { "epoch": 1.6990291262135924, - "grad_norm": 5.345245838165283, + "grad_norm": 5.113691806793213, "learning_rate": 4.717637540453075e-05, - "loss": 4.5216, + "loss": 4.462, "step": 350 }, { "epoch": 1.941747572815534, - "grad_norm": 5.54531192779541, + "grad_norm": 5.632521152496338, "learning_rate": 4.6771844660194174e-05, - "loss": 4.4353, + "loss": 4.3695, "step": 400 }, { "epoch": 2.1844660194174756, - "grad_norm": 5.244582176208496, + "grad_norm": 5.428906440734863, "learning_rate": 4.636731391585761e-05, - "loss": 4.2444, + "loss": 4.1549, "step": 450 }, { "epoch": 2.4271844660194173, - "grad_norm": 5.338261127471924, + "grad_norm": 5.037013530731201, "learning_rate": 4.596278317152104e-05, - "loss": 4.1915, + "loss": 4.0921, "step": 500 }, { "epoch": 2.6699029126213594, - "grad_norm": 5.35228157043457, + "grad_norm": 5.231846809387207, "learning_rate": 4.555825242718447e-05, - "loss": 4.1612, + "loss": 4.0662, "step": 550 }, { "epoch": 2.912621359223301, - "grad_norm": 5.287161827087402, + "grad_norm": 5.506278038024902, "learning_rate": 4.51537216828479e-05, - "loss": 4.1322, + "loss": 4.0403, "step": 600 }, { "epoch": 3.1553398058252426, - "grad_norm": 5.2932209968566895, + "grad_norm": 5.410216331481934, "learning_rate": 4.4749190938511334e-05, - "loss": 3.9684, + "loss": 3.8375, "step": 650 }, { "epoch": 3.3980582524271843, - "grad_norm": 5.4360833168029785, + "grad_norm": 5.637699127197266, "learning_rate": 4.434466019417476e-05, - "loss": 3.8933, + "loss": 3.7473, "step": 700 }, { "epoch": 3.6407766990291264, - "grad_norm": 5.5595221519470215, + "grad_norm": 5.575273513793945, "learning_rate": 4.3940129449838194e-05, - "loss": 3.9306, + "loss": 3.7835, "step": 750 }, { "epoch": 3.883495145631068, - "grad_norm": 5.540086269378662, + "grad_norm": 5.737198352813721, "learning_rate": 4.353559870550162e-05, - "loss": 3.9317, + "loss": 3.7919, "step": 800 }, { "epoch": 4.12621359223301, - "grad_norm": 5.567137718200684, + "grad_norm": 5.645532608032227, "learning_rate": 4.313106796116505e-05, - "loss": 3.8135, + "loss": 3.6425, "step": 850 }, { "epoch": 4.368932038834951, - "grad_norm": 6.00346040725708, + "grad_norm": 6.308927059173584, "learning_rate": 4.272653721682848e-05, - "loss": 3.6924, + "loss": 3.4785, "step": 900 }, { "epoch": 4.611650485436893, - "grad_norm": 5.635782718658447, + "grad_norm": 5.863094329833984, "learning_rate": 4.232200647249191e-05, - "loss": 3.7304, + "loss": 3.5268, "step": 950 }, { "epoch": 4.854368932038835, - "grad_norm": 5.9545440673828125, + "grad_norm": 6.120258331298828, "learning_rate": 4.191747572815534e-05, - "loss": 3.7471, + "loss": 3.5296, "step": 1000 }, { "epoch": 5.097087378640777, - "grad_norm": 5.696166038513184, + "grad_norm": 6.231515884399414, "learning_rate": 4.1512944983818774e-05, - "loss": 3.6287, + "loss": 3.3802, "step": 1050 }, { "epoch": 5.339805825242719, - "grad_norm": 5.844620704650879, + "grad_norm": 6.211188316345215, "learning_rate": 4.11084142394822e-05, - "loss": 3.5299, + "loss": 3.2148, "step": 1100 }, { "epoch": 5.58252427184466, - "grad_norm": 6.373577117919922, + "grad_norm": 6.8676934242248535, "learning_rate": 4.0703883495145634e-05, - "loss": 3.5285, + "loss": 3.2111, "step": 1150 }, { "epoch": 5.825242718446602, - "grad_norm": 6.488651275634766, + "grad_norm": 6.851133346557617, "learning_rate": 4.029935275080906e-05, - "loss": 3.5789, + "loss": 3.2771, "step": 1200 }, { "epoch": 6.067961165048544, - "grad_norm": 6.170980930328369, + "grad_norm": 6.708765506744385, "learning_rate": 3.9894822006472494e-05, - "loss": 3.494, + "loss": 3.1587, "step": 1250 }, { "epoch": 6.310679611650485, - "grad_norm": 6.387782573699951, + "grad_norm": 7.136553764343262, "learning_rate": 3.949029126213593e-05, - "loss": 3.353, + "loss": 2.9192, "step": 1300 }, { "epoch": 6.553398058252427, - "grad_norm": 6.718175888061523, + "grad_norm": 7.370823383331299, "learning_rate": 3.9085760517799354e-05, - "loss": 3.3521, + "loss": 2.922, "step": 1350 }, { "epoch": 6.796116504854369, - "grad_norm": 6.7870025634765625, + "grad_norm": 7.630361557006836, "learning_rate": 3.868122977346279e-05, - "loss": 3.3658, + "loss": 2.9417, "step": 1400 }, { "epoch": 7.038834951456311, - "grad_norm": 6.513721466064453, + "grad_norm": 7.765511989593506, "learning_rate": 3.827669902912622e-05, - "loss": 3.3301, + "loss": 2.8669, "step": 1450 }, { "epoch": 7.281553398058253, - "grad_norm": 7.0680036544799805, + "grad_norm": 7.85439920425415, "learning_rate": 3.787216828478965e-05, - "loss": 3.1289, + "loss": 2.5412, "step": 1500 }, { "epoch": 7.524271844660194, - "grad_norm": 7.019896507263184, + "grad_norm": 8.064071655273438, "learning_rate": 3.746763754045307e-05, - "loss": 3.1878, + "loss": 2.6131, "step": 1550 }, { "epoch": 7.766990291262136, - "grad_norm": 7.3232197761535645, + "grad_norm": 8.692522048950195, "learning_rate": 3.7063106796116507e-05, - "loss": 3.2006, + "loss": 2.6356, "step": 1600 }, { "epoch": 8.009708737864077, - "grad_norm": 7.003912925720215, + "grad_norm": 8.012410163879395, "learning_rate": 3.665857605177993e-05, - "loss": 3.1737, + "loss": 2.599, "step": 1650 }, { "epoch": 8.25242718446602, - "grad_norm": 7.592433929443359, + "grad_norm": 8.6799898147583, "learning_rate": 3.6254045307443366e-05, - "loss": 2.9258, + "loss": 2.1992, "step": 1700 }, { "epoch": 8.495145631067961, - "grad_norm": 7.629278182983398, + "grad_norm": 8.963507652282715, "learning_rate": 3.584951456310679e-05, - "loss": 2.9999, + "loss": 2.2701, "step": 1750 }, { "epoch": 8.737864077669903, - "grad_norm": 8.137636184692383, + "grad_norm": 9.833822250366211, "learning_rate": 3.5444983818770226e-05, - "loss": 3.0155, + "loss": 2.2837, "step": 1800 }, { "epoch": 8.980582524271846, - "grad_norm": 7.967227935791016, + "grad_norm": 9.47242259979248, "learning_rate": 3.504045307443366e-05, - "loss": 2.9945, + "loss": 2.2854, "step": 1850 }, { "epoch": 9.223300970873787, - "grad_norm": 7.9910078048706055, + "grad_norm": 8.856402397155762, "learning_rate": 3.4635922330097086e-05, - "loss": 2.7456, + "loss": 1.8862, "step": 1900 }, { "epoch": 9.466019417475728, - "grad_norm": 8.552720069885254, + "grad_norm": 9.948341369628906, "learning_rate": 3.423139158576052e-05, - "loss": 2.8071, + "loss": 1.922, "step": 1950 }, { "epoch": 9.70873786407767, - "grad_norm": 8.336930274963379, + "grad_norm": 10.28939151763916, "learning_rate": 3.382686084142395e-05, - "loss": 2.8117, + "loss": 1.9622, "step": 2000 }, { "epoch": 9.951456310679612, - "grad_norm": 8.636338233947754, + "grad_norm": 10.114766120910645, "learning_rate": 3.342233009708738e-05, - "loss": 2.8197, + "loss": 1.9741, "step": 2050 }, { "epoch": 10.194174757281553, - "grad_norm": 7.955165863037109, + "grad_norm": 9.489270210266113, "learning_rate": 3.301779935275081e-05, - "loss": 2.6245, + "loss": 1.6509, "step": 2100 }, { "epoch": 10.436893203883495, - "grad_norm": 8.679149627685547, + "grad_norm": 10.16482162475586, "learning_rate": 3.2613268608414246e-05, - "loss": 2.565, + "loss": 1.5883, "step": 2150 }, { "epoch": 10.679611650485437, - "grad_norm": 9.150839805603027, + "grad_norm": 11.778059005737305, "learning_rate": 3.220873786407767e-05, - "loss": 2.6258, + "loss": 1.6234, "step": 2200 }, { "epoch": 10.922330097087379, - "grad_norm": 9.2735013961792, + "grad_norm": 10.798454284667969, "learning_rate": 3.1804207119741106e-05, - "loss": 2.6439, + "loss": 1.667, "step": 2250 }, { "epoch": 11.16504854368932, - "grad_norm": 8.620939254760742, + "grad_norm": 9.751585006713867, "learning_rate": 3.139967637540453e-05, - "loss": 2.4371, + "loss": 1.3785, "step": 2300 }, { "epoch": 11.407766990291263, - "grad_norm": 9.722040176391602, + "grad_norm": 10.266364097595215, "learning_rate": 3.099514563106796e-05, - "loss": 2.4074, + "loss": 1.2956, "step": 2350 }, { "epoch": 11.650485436893204, - "grad_norm": 9.200730323791504, + "grad_norm": 10.660856246948242, "learning_rate": 3.059061488673139e-05, - "loss": 2.4307, + "loss": 1.3321, "step": 2400 }, { "epoch": 11.893203883495145, - "grad_norm": 9.692273139953613, + "grad_norm": 10.896187782287598, "learning_rate": 3.0186084142394822e-05, - "loss": 2.424, + "loss": 1.3533, "step": 2450 }, { "epoch": 12.135922330097088, - "grad_norm": 9.822249412536621, + "grad_norm": 10.952502250671387, "learning_rate": 2.9781553398058252e-05, - "loss": 2.2862, + "loss": 1.1486, "step": 2500 }, { "epoch": 12.37864077669903, - "grad_norm": 9.919060707092285, + "grad_norm": 10.766463279724121, "learning_rate": 2.9377022653721686e-05, - "loss": 2.1913, + "loss": 1.0227, "step": 2550 }, { "epoch": 12.62135922330097, - "grad_norm": 9.181950569152832, + "grad_norm": 9.385764122009277, "learning_rate": 2.8972491909385112e-05, - "loss": 2.241, + "loss": 1.0651, "step": 2600 }, { "epoch": 12.864077669902912, - "grad_norm": 10.82077407836914, + "grad_norm": 11.573925018310547, "learning_rate": 2.8567961165048546e-05, - "loss": 2.2653, + "loss": 1.1013, "step": 2650 }, { "epoch": 13.106796116504855, - "grad_norm": 10.365830421447754, + "grad_norm": 10.138091087341309, "learning_rate": 2.816343042071198e-05, - "loss": 2.1352, + "loss": 0.9545, "step": 2700 }, { "epoch": 13.349514563106796, - "grad_norm": 10.073640823364258, + "grad_norm": 9.867119789123535, "learning_rate": 2.7758899676375405e-05, - "loss": 2.0227, + "loss": 0.7892, "step": 2750 }, { "epoch": 13.592233009708737, - "grad_norm": 10.520242691040039, + "grad_norm": 10.443971633911133, "learning_rate": 2.735436893203884e-05, - "loss": 2.0579, + "loss": 0.8432, "step": 2800 }, { "epoch": 13.83495145631068, - "grad_norm": 11.176478385925293, + "grad_norm": 10.633298873901367, "learning_rate": 2.6949838187702265e-05, - "loss": 2.0934, + "loss": 0.8543, "step": 2850 }, { "epoch": 14.077669902912621, - "grad_norm": 9.541868209838867, + "grad_norm": 8.640830039978027, "learning_rate": 2.6545307443365695e-05, - "loss": 2.0118, + "loss": 0.7751, "step": 2900 }, { "epoch": 14.320388349514563, - "grad_norm": 10.162981986999512, + "grad_norm": 10.461247444152832, "learning_rate": 2.614077669902913e-05, - "loss": 1.8624, + "loss": 0.6074, "step": 2950 }, { "epoch": 14.563106796116505, - "grad_norm": 10.51291275024414, + "grad_norm": 10.757479667663574, "learning_rate": 2.5736245954692555e-05, - "loss": 1.8979, + "loss": 0.6354, "step": 3000 } ], @@ -447,7 +447,7 @@ "attributes": {} } }, - "total_flos": 391945125888000.0, + "total_flos": 1393060478976000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoints/checkpoint-3000/training_args.bin b/checkpoints/checkpoint-3000/training_args.bin index 0403c28e41ead56dc065860ac748f500b2df575a..6dcc5a1a1def25e4256c25ec89c109edb25731fb 100644 --- a/checkpoints/checkpoint-3000/training_args.bin +++ b/checkpoints/checkpoint-3000/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7935e3b7629485cf26f624afe618b58a390d4c3c9c6dcf878b3ba9099ef1027b +oid sha256:d5c04ce53d557cd762ff77607aaadbd1248aaa177697662d9a21b90d444993b7 size 5304 diff --git a/checkpoints/checkpoint-3500/config.json b/checkpoints/checkpoint-3500/config.json index fdd810faba252b1c3f085663999fe9c24e8e44d6..990b1515737ce9bfc8638b4941fcfacdce1334d3 100644 --- a/checkpoints/checkpoint-3500/config.json +++ b/checkpoints/checkpoint-3500/config.json @@ -10,12 +10,12 @@ "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", - "n_ctx": 1024, - "n_embd": 768, + "n_ctx": 2048, + "n_embd": 1024, "n_head": 2, "n_inner": null, - "n_layer": 6, - "n_positions": 1024, + "n_layer": 12, + "n_positions": 2048, "pad_token_id": 1, "reorder_and_upcast_attn": false, "resid_pdrop": 0.1, diff --git a/checkpoints/checkpoint-3500/model.safetensors b/checkpoints/checkpoint-3500/model.safetensors index 9a354fcef3d342ed3d3374594c7d5df33c87fa5f..580b0af0e2a332d77983b28e5031e2812e0c100f 100644 --- a/checkpoints/checkpoint-3500/model.safetensors +++ b/checkpoints/checkpoint-3500/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c86a085cb6768ba7ee32df0f83847ca1776cf0b157c33a638d029df90f359293 -size 176343496 +oid sha256:ee4eddcf2434fc10bcb84be31fad14eaf9cb56d0657a2676caebdc4fa8006954 +size 617130824 diff --git a/checkpoints/checkpoint-3500/optimizer.pt b/checkpoints/checkpoint-3500/optimizer.pt index 21e5272f0d1c5ee85b3f2a9b2e58c421df718ea7..23599235aa9fa249cd7dcd109fa1aee3678e69c7 100644 --- a/checkpoints/checkpoint-3500/optimizer.pt +++ b/checkpoints/checkpoint-3500/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41d7bf321a4eb94e1fe13d18a0e7d4c754699c0e788055516e6543148a689329 -size 352735610 +oid sha256:f0146bc12519d4396fc95dbc8a2c0e2b1eba55de6a678f2ca3f1bb831c8111e6 +size 1234355130 diff --git a/checkpoints/checkpoint-3500/rng_state.pth b/checkpoints/checkpoint-3500/rng_state.pth index 01a728d5ec4423b27f95954b769d08a88fb5a14a..193aa75ab2143187bd98fc10ef19b359e207026e 100644 --- a/checkpoints/checkpoint-3500/rng_state.pth +++ b/checkpoints/checkpoint-3500/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f5d7ec8cb6d519ee4314413f301a6d5a59c0c427b2ba96fbc7ceba752e82cdf8 +oid sha256:98ff60b9771555670f092dfde608075c59197b1a93a27a330f270d03668fc64e size 14244 diff --git a/checkpoints/checkpoint-3500/trainer_state.json b/checkpoints/checkpoint-3500/trainer_state.json index 3a2452e21cba316c51b4168f87df0656e8a65ebf..0db101c206185684366f921caffc3ae6297284bb 100644 --- a/checkpoints/checkpoint-3500/trainer_state.json +++ b/checkpoints/checkpoint-3500/trainer_state.json @@ -11,492 +11,492 @@ "log_history": [ { "epoch": 0.24271844660194175, - "grad_norm": 2.239194869995117, + "grad_norm": 2.8362834453582764, "learning_rate": 4.960355987055016e-05, - "loss": 6.099, + "loss": 6.0742, "step": 50 }, { "epoch": 0.4854368932038835, - "grad_norm": 3.8653194904327393, + "grad_norm": 2.9484496116638184, "learning_rate": 4.9199029126213595e-05, - "loss": 5.6954, + "loss": 5.7218, "step": 100 }, { "epoch": 0.7281553398058253, - "grad_norm": 4.652886867523193, + "grad_norm": 3.542572259902954, "learning_rate": 4.879449838187702e-05, - "loss": 5.3598, + "loss": 5.4227, "step": 150 }, { "epoch": 0.970873786407767, - "grad_norm": 5.048999309539795, + "grad_norm": 4.634098052978516, "learning_rate": 4.8389967637540455e-05, - "loss": 5.0982, + "loss": 5.1655, "step": 200 }, { "epoch": 1.2135922330097086, - "grad_norm": 5.21944522857666, + "grad_norm": 5.699330806732178, "learning_rate": 4.798543689320388e-05, - "loss": 4.7456, + "loss": 4.8009, "step": 250 }, { "epoch": 1.4563106796116505, - "grad_norm": 5.471007347106934, + "grad_norm": 5.711933135986328, "learning_rate": 4.7580906148867315e-05, - "loss": 4.6254, + "loss": 4.607, "step": 300 }, { "epoch": 1.6990291262135924, - "grad_norm": 5.345245838165283, + "grad_norm": 5.113691806793213, "learning_rate": 4.717637540453075e-05, - "loss": 4.5216, + "loss": 4.462, "step": 350 }, { "epoch": 1.941747572815534, - "grad_norm": 5.54531192779541, + "grad_norm": 5.632521152496338, "learning_rate": 4.6771844660194174e-05, - "loss": 4.4353, + "loss": 4.3695, "step": 400 }, { "epoch": 2.1844660194174756, - "grad_norm": 5.244582176208496, + "grad_norm": 5.428906440734863, "learning_rate": 4.636731391585761e-05, - "loss": 4.2444, + "loss": 4.1549, "step": 450 }, { "epoch": 2.4271844660194173, - "grad_norm": 5.338261127471924, + "grad_norm": 5.037013530731201, "learning_rate": 4.596278317152104e-05, - "loss": 4.1915, + "loss": 4.0921, "step": 500 }, { "epoch": 2.6699029126213594, - "grad_norm": 5.35228157043457, + "grad_norm": 5.231846809387207, "learning_rate": 4.555825242718447e-05, - "loss": 4.1612, + "loss": 4.0662, "step": 550 }, { "epoch": 2.912621359223301, - "grad_norm": 5.287161827087402, + "grad_norm": 5.506278038024902, "learning_rate": 4.51537216828479e-05, - "loss": 4.1322, + "loss": 4.0403, "step": 600 }, { "epoch": 3.1553398058252426, - "grad_norm": 5.2932209968566895, + "grad_norm": 5.410216331481934, "learning_rate": 4.4749190938511334e-05, - "loss": 3.9684, + "loss": 3.8375, "step": 650 }, { "epoch": 3.3980582524271843, - "grad_norm": 5.4360833168029785, + "grad_norm": 5.637699127197266, "learning_rate": 4.434466019417476e-05, - "loss": 3.8933, + "loss": 3.7473, "step": 700 }, { "epoch": 3.6407766990291264, - "grad_norm": 5.5595221519470215, + "grad_norm": 5.575273513793945, "learning_rate": 4.3940129449838194e-05, - "loss": 3.9306, + "loss": 3.7835, "step": 750 }, { "epoch": 3.883495145631068, - "grad_norm": 5.540086269378662, + "grad_norm": 5.737198352813721, "learning_rate": 4.353559870550162e-05, - "loss": 3.9317, + "loss": 3.7919, "step": 800 }, { "epoch": 4.12621359223301, - "grad_norm": 5.567137718200684, + "grad_norm": 5.645532608032227, "learning_rate": 4.313106796116505e-05, - "loss": 3.8135, + "loss": 3.6425, "step": 850 }, { "epoch": 4.368932038834951, - "grad_norm": 6.00346040725708, + "grad_norm": 6.308927059173584, "learning_rate": 4.272653721682848e-05, - "loss": 3.6924, + "loss": 3.4785, "step": 900 }, { "epoch": 4.611650485436893, - "grad_norm": 5.635782718658447, + "grad_norm": 5.863094329833984, "learning_rate": 4.232200647249191e-05, - "loss": 3.7304, + "loss": 3.5268, "step": 950 }, { "epoch": 4.854368932038835, - "grad_norm": 5.9545440673828125, + "grad_norm": 6.120258331298828, "learning_rate": 4.191747572815534e-05, - "loss": 3.7471, + "loss": 3.5296, "step": 1000 }, { "epoch": 5.097087378640777, - "grad_norm": 5.696166038513184, + "grad_norm": 6.231515884399414, "learning_rate": 4.1512944983818774e-05, - "loss": 3.6287, + "loss": 3.3802, "step": 1050 }, { "epoch": 5.339805825242719, - "grad_norm": 5.844620704650879, + "grad_norm": 6.211188316345215, "learning_rate": 4.11084142394822e-05, - "loss": 3.5299, + "loss": 3.2148, "step": 1100 }, { "epoch": 5.58252427184466, - "grad_norm": 6.373577117919922, + "grad_norm": 6.8676934242248535, "learning_rate": 4.0703883495145634e-05, - "loss": 3.5285, + "loss": 3.2111, "step": 1150 }, { "epoch": 5.825242718446602, - "grad_norm": 6.488651275634766, + "grad_norm": 6.851133346557617, "learning_rate": 4.029935275080906e-05, - "loss": 3.5789, + "loss": 3.2771, "step": 1200 }, { "epoch": 6.067961165048544, - "grad_norm": 6.170980930328369, + "grad_norm": 6.708765506744385, "learning_rate": 3.9894822006472494e-05, - "loss": 3.494, + "loss": 3.1587, "step": 1250 }, { "epoch": 6.310679611650485, - "grad_norm": 6.387782573699951, + "grad_norm": 7.136553764343262, "learning_rate": 3.949029126213593e-05, - "loss": 3.353, + "loss": 2.9192, "step": 1300 }, { "epoch": 6.553398058252427, - "grad_norm": 6.718175888061523, + "grad_norm": 7.370823383331299, "learning_rate": 3.9085760517799354e-05, - "loss": 3.3521, + "loss": 2.922, "step": 1350 }, { "epoch": 6.796116504854369, - "grad_norm": 6.7870025634765625, + "grad_norm": 7.630361557006836, "learning_rate": 3.868122977346279e-05, - "loss": 3.3658, + "loss": 2.9417, "step": 1400 }, { "epoch": 7.038834951456311, - "grad_norm": 6.513721466064453, + "grad_norm": 7.765511989593506, "learning_rate": 3.827669902912622e-05, - "loss": 3.3301, + "loss": 2.8669, "step": 1450 }, { "epoch": 7.281553398058253, - "grad_norm": 7.0680036544799805, + "grad_norm": 7.85439920425415, "learning_rate": 3.787216828478965e-05, - "loss": 3.1289, + "loss": 2.5412, "step": 1500 }, { "epoch": 7.524271844660194, - "grad_norm": 7.019896507263184, + "grad_norm": 8.064071655273438, "learning_rate": 3.746763754045307e-05, - "loss": 3.1878, + "loss": 2.6131, "step": 1550 }, { "epoch": 7.766990291262136, - "grad_norm": 7.3232197761535645, + "grad_norm": 8.692522048950195, "learning_rate": 3.7063106796116507e-05, - "loss": 3.2006, + "loss": 2.6356, "step": 1600 }, { "epoch": 8.009708737864077, - "grad_norm": 7.003912925720215, + "grad_norm": 8.012410163879395, "learning_rate": 3.665857605177993e-05, - "loss": 3.1737, + "loss": 2.599, "step": 1650 }, { "epoch": 8.25242718446602, - "grad_norm": 7.592433929443359, + "grad_norm": 8.6799898147583, "learning_rate": 3.6254045307443366e-05, - "loss": 2.9258, + "loss": 2.1992, "step": 1700 }, { "epoch": 8.495145631067961, - "grad_norm": 7.629278182983398, + "grad_norm": 8.963507652282715, "learning_rate": 3.584951456310679e-05, - "loss": 2.9999, + "loss": 2.2701, "step": 1750 }, { "epoch": 8.737864077669903, - "grad_norm": 8.137636184692383, + "grad_norm": 9.833822250366211, "learning_rate": 3.5444983818770226e-05, - "loss": 3.0155, + "loss": 2.2837, "step": 1800 }, { "epoch": 8.980582524271846, - "grad_norm": 7.967227935791016, + "grad_norm": 9.47242259979248, "learning_rate": 3.504045307443366e-05, - "loss": 2.9945, + "loss": 2.2854, "step": 1850 }, { "epoch": 9.223300970873787, - "grad_norm": 7.9910078048706055, + "grad_norm": 8.856402397155762, "learning_rate": 3.4635922330097086e-05, - "loss": 2.7456, + "loss": 1.8862, "step": 1900 }, { "epoch": 9.466019417475728, - "grad_norm": 8.552720069885254, + "grad_norm": 9.948341369628906, "learning_rate": 3.423139158576052e-05, - "loss": 2.8071, + "loss": 1.922, "step": 1950 }, { "epoch": 9.70873786407767, - "grad_norm": 8.336930274963379, + "grad_norm": 10.28939151763916, "learning_rate": 3.382686084142395e-05, - "loss": 2.8117, + "loss": 1.9622, "step": 2000 }, { "epoch": 9.951456310679612, - "grad_norm": 8.636338233947754, + "grad_norm": 10.114766120910645, "learning_rate": 3.342233009708738e-05, - "loss": 2.8197, + "loss": 1.9741, "step": 2050 }, { "epoch": 10.194174757281553, - "grad_norm": 7.955165863037109, + "grad_norm": 9.489270210266113, "learning_rate": 3.301779935275081e-05, - "loss": 2.6245, + "loss": 1.6509, "step": 2100 }, { "epoch": 10.436893203883495, - "grad_norm": 8.679149627685547, + "grad_norm": 10.16482162475586, "learning_rate": 3.2613268608414246e-05, - "loss": 2.565, + "loss": 1.5883, "step": 2150 }, { "epoch": 10.679611650485437, - "grad_norm": 9.150839805603027, + "grad_norm": 11.778059005737305, "learning_rate": 3.220873786407767e-05, - "loss": 2.6258, + "loss": 1.6234, "step": 2200 }, { "epoch": 10.922330097087379, - "grad_norm": 9.2735013961792, + "grad_norm": 10.798454284667969, "learning_rate": 3.1804207119741106e-05, - "loss": 2.6439, + "loss": 1.667, "step": 2250 }, { "epoch": 11.16504854368932, - "grad_norm": 8.620939254760742, + "grad_norm": 9.751585006713867, "learning_rate": 3.139967637540453e-05, - "loss": 2.4371, + "loss": 1.3785, "step": 2300 }, { "epoch": 11.407766990291263, - "grad_norm": 9.722040176391602, + "grad_norm": 10.266364097595215, "learning_rate": 3.099514563106796e-05, - "loss": 2.4074, + "loss": 1.2956, "step": 2350 }, { "epoch": 11.650485436893204, - "grad_norm": 9.200730323791504, + "grad_norm": 10.660856246948242, "learning_rate": 3.059061488673139e-05, - "loss": 2.4307, + "loss": 1.3321, "step": 2400 }, { "epoch": 11.893203883495145, - "grad_norm": 9.692273139953613, + "grad_norm": 10.896187782287598, "learning_rate": 3.0186084142394822e-05, - "loss": 2.424, + "loss": 1.3533, "step": 2450 }, { "epoch": 12.135922330097088, - "grad_norm": 9.822249412536621, + "grad_norm": 10.952502250671387, "learning_rate": 2.9781553398058252e-05, - "loss": 2.2862, + "loss": 1.1486, "step": 2500 }, { "epoch": 12.37864077669903, - "grad_norm": 9.919060707092285, + "grad_norm": 10.766463279724121, "learning_rate": 2.9377022653721686e-05, - "loss": 2.1913, + "loss": 1.0227, "step": 2550 }, { "epoch": 12.62135922330097, - "grad_norm": 9.181950569152832, + "grad_norm": 9.385764122009277, "learning_rate": 2.8972491909385112e-05, - "loss": 2.241, + "loss": 1.0651, "step": 2600 }, { "epoch": 12.864077669902912, - "grad_norm": 10.82077407836914, + "grad_norm": 11.573925018310547, "learning_rate": 2.8567961165048546e-05, - "loss": 2.2653, + "loss": 1.1013, "step": 2650 }, { "epoch": 13.106796116504855, - "grad_norm": 10.365830421447754, + "grad_norm": 10.138091087341309, "learning_rate": 2.816343042071198e-05, - "loss": 2.1352, + "loss": 0.9545, "step": 2700 }, { "epoch": 13.349514563106796, - "grad_norm": 10.073640823364258, + "grad_norm": 9.867119789123535, "learning_rate": 2.7758899676375405e-05, - "loss": 2.0227, + "loss": 0.7892, "step": 2750 }, { "epoch": 13.592233009708737, - "grad_norm": 10.520242691040039, + "grad_norm": 10.443971633911133, "learning_rate": 2.735436893203884e-05, - "loss": 2.0579, + "loss": 0.8432, "step": 2800 }, { "epoch": 13.83495145631068, - "grad_norm": 11.176478385925293, + "grad_norm": 10.633298873901367, "learning_rate": 2.6949838187702265e-05, - "loss": 2.0934, + "loss": 0.8543, "step": 2850 }, { "epoch": 14.077669902912621, - "grad_norm": 9.541868209838867, + "grad_norm": 8.640830039978027, "learning_rate": 2.6545307443365695e-05, - "loss": 2.0118, + "loss": 0.7751, "step": 2900 }, { "epoch": 14.320388349514563, - "grad_norm": 10.162981986999512, + "grad_norm": 10.461247444152832, "learning_rate": 2.614077669902913e-05, - "loss": 1.8624, + "loss": 0.6074, "step": 2950 }, { "epoch": 14.563106796116505, - "grad_norm": 10.51291275024414, + "grad_norm": 10.757479667663574, "learning_rate": 2.5736245954692555e-05, - "loss": 1.8979, + "loss": 0.6354, "step": 3000 }, { "epoch": 14.805825242718447, - "grad_norm": 11.316717147827148, + "grad_norm": 11.599132537841797, "learning_rate": 2.533171521035599e-05, - "loss": 1.915, + "loss": 0.6694, "step": 3050 }, { "epoch": 15.048543689320388, - "grad_norm": 9.184353828430176, + "grad_norm": 8.085358619689941, "learning_rate": 2.492718446601942e-05, - "loss": 1.885, + "loss": 0.6306, "step": 3100 }, { "epoch": 15.29126213592233, - "grad_norm": 10.70876693725586, + "grad_norm": 8.327988624572754, "learning_rate": 2.452265372168285e-05, - "loss": 1.7024, + "loss": 0.4628, "step": 3150 }, { "epoch": 15.533980582524272, - "grad_norm": 10.767953872680664, + "grad_norm": 8.545391082763672, "learning_rate": 2.411812297734628e-05, - "loss": 1.729, + "loss": 0.4915, "step": 3200 }, { "epoch": 15.776699029126213, - "grad_norm": 11.83687973022461, + "grad_norm": 10.176375389099121, "learning_rate": 2.3713592233009708e-05, - "loss": 1.7697, + "loss": 0.5105, "step": 3250 }, { "epoch": 16.019417475728154, - "grad_norm": 9.888689994812012, + "grad_norm": 7.821159362792969, "learning_rate": 2.3309061488673138e-05, - "loss": 1.7605, + "loss": 0.5055, "step": 3300 }, { "epoch": 16.262135922330096, - "grad_norm": 9.394012451171875, + "grad_norm": 7.797800064086914, "learning_rate": 2.290453074433657e-05, - "loss": 1.5389, + "loss": 0.3486, "step": 3350 }, { "epoch": 16.50485436893204, - "grad_norm": 11.3172025680542, + "grad_norm": 8.497370719909668, "learning_rate": 2.25e-05, - "loss": 1.5836, + "loss": 0.3627, "step": 3400 }, { "epoch": 16.74757281553398, - "grad_norm": 11.000115394592285, + "grad_norm": 9.842249870300293, "learning_rate": 2.209546925566343e-05, - "loss": 1.6398, + "loss": 0.3969, "step": 3450 }, { "epoch": 16.990291262135923, - "grad_norm": 11.453404426574707, + "grad_norm": 10.705739974975586, "learning_rate": 2.169093851132686e-05, - "loss": 1.6466, + "loss": 0.4028, "step": 3500 } ], @@ -517,7 +517,7 @@ "attributes": {} } }, - "total_flos": 457269313536000.0, + "total_flos": 1625237225472000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoints/checkpoint-3500/training_args.bin b/checkpoints/checkpoint-3500/training_args.bin index 0403c28e41ead56dc065860ac748f500b2df575a..6dcc5a1a1def25e4256c25ec89c109edb25731fb 100644 --- a/checkpoints/checkpoint-3500/training_args.bin +++ b/checkpoints/checkpoint-3500/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7935e3b7629485cf26f624afe618b58a390d4c3c9c6dcf878b3ba9099ef1027b +oid sha256:d5c04ce53d557cd762ff77607aaadbd1248aaa177697662d9a21b90d444993b7 size 5304 diff --git a/checkpoints/checkpoint-4000/config.json b/checkpoints/checkpoint-4000/config.json index fdd810faba252b1c3f085663999fe9c24e8e44d6..990b1515737ce9bfc8638b4941fcfacdce1334d3 100644 --- a/checkpoints/checkpoint-4000/config.json +++ b/checkpoints/checkpoint-4000/config.json @@ -10,12 +10,12 @@ "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", - "n_ctx": 1024, - "n_embd": 768, + "n_ctx": 2048, + "n_embd": 1024, "n_head": 2, "n_inner": null, - "n_layer": 6, - "n_positions": 1024, + "n_layer": 12, + "n_positions": 2048, "pad_token_id": 1, "reorder_and_upcast_attn": false, "resid_pdrop": 0.1, diff --git a/checkpoints/checkpoint-4000/model.safetensors b/checkpoints/checkpoint-4000/model.safetensors index c43bc1bbcf624fb7954207f83e81876212551933..2339bcc48988481c9d29790d3f7dc9edadb5e14a 100644 --- a/checkpoints/checkpoint-4000/model.safetensors +++ b/checkpoints/checkpoint-4000/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b6dec913fb019f72d2b4e5ba61a3e03faf3f77c63613f93c6c44239205901dc9 -size 176343496 +oid sha256:1c104ff20a7d8e0674d2df872410a78a14262bdb171fe8770ebf6ad6cb47abe3 +size 617130824 diff --git a/checkpoints/checkpoint-4000/optimizer.pt b/checkpoints/checkpoint-4000/optimizer.pt index f818b6875c679364252650ff5d703fa3158cdd24..fb72cd8eb5713e14f37cbb2d0ab4ca99e167bba4 100644 --- a/checkpoints/checkpoint-4000/optimizer.pt +++ b/checkpoints/checkpoint-4000/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3e578e56f38303973d77fb3e27d42c6872cb40d601b37756b09c847fa9d0eb4a -size 352735610 +oid sha256:62e74ed7e1c6f81b24fe69e4ef050494e5a8e9227ed6983c8647dc5fbd0d0ade +size 1234355130 diff --git a/checkpoints/checkpoint-4000/rng_state.pth b/checkpoints/checkpoint-4000/rng_state.pth index df765ea7a08c0bec625de4e274c9cb83fc04e121..422444df8ce703d242e9559d3ccebbd1a8839656 100644 --- a/checkpoints/checkpoint-4000/rng_state.pth +++ b/checkpoints/checkpoint-4000/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb15039547827812ba56408e84af91a8186a7d1541bf239173c9fa71b57742c8 +oid sha256:2b2997b95ac6c0d6711e7d18cba7213a8b49deca7976b4a4ecbf3d8da67023b1 size 14244 diff --git a/checkpoints/checkpoint-4000/trainer_state.json b/checkpoints/checkpoint-4000/trainer_state.json index b38877443814905eb9627c4173394e6eca965b89..a647be9c8f9ec138aebec3c68cab246260170303 100644 --- a/checkpoints/checkpoint-4000/trainer_state.json +++ b/checkpoints/checkpoint-4000/trainer_state.json @@ -11,562 +11,562 @@ "log_history": [ { "epoch": 0.24271844660194175, - "grad_norm": 2.239194869995117, + "grad_norm": 2.8362834453582764, "learning_rate": 4.960355987055016e-05, - "loss": 6.099, + "loss": 6.0742, "step": 50 }, { "epoch": 0.4854368932038835, - "grad_norm": 3.8653194904327393, + "grad_norm": 2.9484496116638184, "learning_rate": 4.9199029126213595e-05, - "loss": 5.6954, + "loss": 5.7218, "step": 100 }, { "epoch": 0.7281553398058253, - "grad_norm": 4.652886867523193, + "grad_norm": 3.542572259902954, "learning_rate": 4.879449838187702e-05, - "loss": 5.3598, + "loss": 5.4227, "step": 150 }, { "epoch": 0.970873786407767, - "grad_norm": 5.048999309539795, + "grad_norm": 4.634098052978516, "learning_rate": 4.8389967637540455e-05, - "loss": 5.0982, + "loss": 5.1655, "step": 200 }, { "epoch": 1.2135922330097086, - "grad_norm": 5.21944522857666, + "grad_norm": 5.699330806732178, "learning_rate": 4.798543689320388e-05, - "loss": 4.7456, + "loss": 4.8009, "step": 250 }, { "epoch": 1.4563106796116505, - "grad_norm": 5.471007347106934, + "grad_norm": 5.711933135986328, "learning_rate": 4.7580906148867315e-05, - "loss": 4.6254, + "loss": 4.607, "step": 300 }, { "epoch": 1.6990291262135924, - "grad_norm": 5.345245838165283, + "grad_norm": 5.113691806793213, "learning_rate": 4.717637540453075e-05, - "loss": 4.5216, + "loss": 4.462, "step": 350 }, { "epoch": 1.941747572815534, - "grad_norm": 5.54531192779541, + "grad_norm": 5.632521152496338, "learning_rate": 4.6771844660194174e-05, - "loss": 4.4353, + "loss": 4.3695, "step": 400 }, { "epoch": 2.1844660194174756, - "grad_norm": 5.244582176208496, + "grad_norm": 5.428906440734863, "learning_rate": 4.636731391585761e-05, - "loss": 4.2444, + "loss": 4.1549, "step": 450 }, { "epoch": 2.4271844660194173, - "grad_norm": 5.338261127471924, + "grad_norm": 5.037013530731201, "learning_rate": 4.596278317152104e-05, - "loss": 4.1915, + "loss": 4.0921, "step": 500 }, { "epoch": 2.6699029126213594, - "grad_norm": 5.35228157043457, + "grad_norm": 5.231846809387207, "learning_rate": 4.555825242718447e-05, - "loss": 4.1612, + "loss": 4.0662, "step": 550 }, { "epoch": 2.912621359223301, - "grad_norm": 5.287161827087402, + "grad_norm": 5.506278038024902, "learning_rate": 4.51537216828479e-05, - "loss": 4.1322, + "loss": 4.0403, "step": 600 }, { "epoch": 3.1553398058252426, - "grad_norm": 5.2932209968566895, + "grad_norm": 5.410216331481934, "learning_rate": 4.4749190938511334e-05, - "loss": 3.9684, + "loss": 3.8375, "step": 650 }, { "epoch": 3.3980582524271843, - "grad_norm": 5.4360833168029785, + "grad_norm": 5.637699127197266, "learning_rate": 4.434466019417476e-05, - "loss": 3.8933, + "loss": 3.7473, "step": 700 }, { "epoch": 3.6407766990291264, - "grad_norm": 5.5595221519470215, + "grad_norm": 5.575273513793945, "learning_rate": 4.3940129449838194e-05, - "loss": 3.9306, + "loss": 3.7835, "step": 750 }, { "epoch": 3.883495145631068, - "grad_norm": 5.540086269378662, + "grad_norm": 5.737198352813721, "learning_rate": 4.353559870550162e-05, - "loss": 3.9317, + "loss": 3.7919, "step": 800 }, { "epoch": 4.12621359223301, - "grad_norm": 5.567137718200684, + "grad_norm": 5.645532608032227, "learning_rate": 4.313106796116505e-05, - "loss": 3.8135, + "loss": 3.6425, "step": 850 }, { "epoch": 4.368932038834951, - "grad_norm": 6.00346040725708, + "grad_norm": 6.308927059173584, "learning_rate": 4.272653721682848e-05, - "loss": 3.6924, + "loss": 3.4785, "step": 900 }, { "epoch": 4.611650485436893, - "grad_norm": 5.635782718658447, + "grad_norm": 5.863094329833984, "learning_rate": 4.232200647249191e-05, - "loss": 3.7304, + "loss": 3.5268, "step": 950 }, { "epoch": 4.854368932038835, - "grad_norm": 5.9545440673828125, + "grad_norm": 6.120258331298828, "learning_rate": 4.191747572815534e-05, - "loss": 3.7471, + "loss": 3.5296, "step": 1000 }, { "epoch": 5.097087378640777, - "grad_norm": 5.696166038513184, + "grad_norm": 6.231515884399414, "learning_rate": 4.1512944983818774e-05, - "loss": 3.6287, + "loss": 3.3802, "step": 1050 }, { "epoch": 5.339805825242719, - "grad_norm": 5.844620704650879, + "grad_norm": 6.211188316345215, "learning_rate": 4.11084142394822e-05, - "loss": 3.5299, + "loss": 3.2148, "step": 1100 }, { "epoch": 5.58252427184466, - "grad_norm": 6.373577117919922, + "grad_norm": 6.8676934242248535, "learning_rate": 4.0703883495145634e-05, - "loss": 3.5285, + "loss": 3.2111, "step": 1150 }, { "epoch": 5.825242718446602, - "grad_norm": 6.488651275634766, + "grad_norm": 6.851133346557617, "learning_rate": 4.029935275080906e-05, - "loss": 3.5789, + "loss": 3.2771, "step": 1200 }, { "epoch": 6.067961165048544, - "grad_norm": 6.170980930328369, + "grad_norm": 6.708765506744385, "learning_rate": 3.9894822006472494e-05, - "loss": 3.494, + "loss": 3.1587, "step": 1250 }, { "epoch": 6.310679611650485, - "grad_norm": 6.387782573699951, + "grad_norm": 7.136553764343262, "learning_rate": 3.949029126213593e-05, - "loss": 3.353, + "loss": 2.9192, "step": 1300 }, { "epoch": 6.553398058252427, - "grad_norm": 6.718175888061523, + "grad_norm": 7.370823383331299, "learning_rate": 3.9085760517799354e-05, - "loss": 3.3521, + "loss": 2.922, "step": 1350 }, { "epoch": 6.796116504854369, - "grad_norm": 6.7870025634765625, + "grad_norm": 7.630361557006836, "learning_rate": 3.868122977346279e-05, - "loss": 3.3658, + "loss": 2.9417, "step": 1400 }, { "epoch": 7.038834951456311, - "grad_norm": 6.513721466064453, + "grad_norm": 7.765511989593506, "learning_rate": 3.827669902912622e-05, - "loss": 3.3301, + "loss": 2.8669, "step": 1450 }, { "epoch": 7.281553398058253, - "grad_norm": 7.0680036544799805, + "grad_norm": 7.85439920425415, "learning_rate": 3.787216828478965e-05, - "loss": 3.1289, + "loss": 2.5412, "step": 1500 }, { "epoch": 7.524271844660194, - "grad_norm": 7.019896507263184, + "grad_norm": 8.064071655273438, "learning_rate": 3.746763754045307e-05, - "loss": 3.1878, + "loss": 2.6131, "step": 1550 }, { "epoch": 7.766990291262136, - "grad_norm": 7.3232197761535645, + "grad_norm": 8.692522048950195, "learning_rate": 3.7063106796116507e-05, - "loss": 3.2006, + "loss": 2.6356, "step": 1600 }, { "epoch": 8.009708737864077, - "grad_norm": 7.003912925720215, + "grad_norm": 8.012410163879395, "learning_rate": 3.665857605177993e-05, - "loss": 3.1737, + "loss": 2.599, "step": 1650 }, { "epoch": 8.25242718446602, - "grad_norm": 7.592433929443359, + "grad_norm": 8.6799898147583, "learning_rate": 3.6254045307443366e-05, - "loss": 2.9258, + "loss": 2.1992, "step": 1700 }, { "epoch": 8.495145631067961, - "grad_norm": 7.629278182983398, + "grad_norm": 8.963507652282715, "learning_rate": 3.584951456310679e-05, - "loss": 2.9999, + "loss": 2.2701, "step": 1750 }, { "epoch": 8.737864077669903, - "grad_norm": 8.137636184692383, + "grad_norm": 9.833822250366211, "learning_rate": 3.5444983818770226e-05, - "loss": 3.0155, + "loss": 2.2837, "step": 1800 }, { "epoch": 8.980582524271846, - "grad_norm": 7.967227935791016, + "grad_norm": 9.47242259979248, "learning_rate": 3.504045307443366e-05, - "loss": 2.9945, + "loss": 2.2854, "step": 1850 }, { "epoch": 9.223300970873787, - "grad_norm": 7.9910078048706055, + "grad_norm": 8.856402397155762, "learning_rate": 3.4635922330097086e-05, - "loss": 2.7456, + "loss": 1.8862, "step": 1900 }, { "epoch": 9.466019417475728, - "grad_norm": 8.552720069885254, + "grad_norm": 9.948341369628906, "learning_rate": 3.423139158576052e-05, - "loss": 2.8071, + "loss": 1.922, "step": 1950 }, { "epoch": 9.70873786407767, - "grad_norm": 8.336930274963379, + "grad_norm": 10.28939151763916, "learning_rate": 3.382686084142395e-05, - "loss": 2.8117, + "loss": 1.9622, "step": 2000 }, { "epoch": 9.951456310679612, - "grad_norm": 8.636338233947754, + "grad_norm": 10.114766120910645, "learning_rate": 3.342233009708738e-05, - "loss": 2.8197, + "loss": 1.9741, "step": 2050 }, { "epoch": 10.194174757281553, - "grad_norm": 7.955165863037109, + "grad_norm": 9.489270210266113, "learning_rate": 3.301779935275081e-05, - "loss": 2.6245, + "loss": 1.6509, "step": 2100 }, { "epoch": 10.436893203883495, - "grad_norm": 8.679149627685547, + "grad_norm": 10.16482162475586, "learning_rate": 3.2613268608414246e-05, - "loss": 2.565, + "loss": 1.5883, "step": 2150 }, { "epoch": 10.679611650485437, - "grad_norm": 9.150839805603027, + "grad_norm": 11.778059005737305, "learning_rate": 3.220873786407767e-05, - "loss": 2.6258, + "loss": 1.6234, "step": 2200 }, { "epoch": 10.922330097087379, - "grad_norm": 9.2735013961792, + "grad_norm": 10.798454284667969, "learning_rate": 3.1804207119741106e-05, - "loss": 2.6439, + "loss": 1.667, "step": 2250 }, { "epoch": 11.16504854368932, - "grad_norm": 8.620939254760742, + "grad_norm": 9.751585006713867, "learning_rate": 3.139967637540453e-05, - "loss": 2.4371, + "loss": 1.3785, "step": 2300 }, { "epoch": 11.407766990291263, - "grad_norm": 9.722040176391602, + "grad_norm": 10.266364097595215, "learning_rate": 3.099514563106796e-05, - "loss": 2.4074, + "loss": 1.2956, "step": 2350 }, { "epoch": 11.650485436893204, - "grad_norm": 9.200730323791504, + "grad_norm": 10.660856246948242, "learning_rate": 3.059061488673139e-05, - "loss": 2.4307, + "loss": 1.3321, "step": 2400 }, { "epoch": 11.893203883495145, - "grad_norm": 9.692273139953613, + "grad_norm": 10.896187782287598, "learning_rate": 3.0186084142394822e-05, - "loss": 2.424, + "loss": 1.3533, "step": 2450 }, { "epoch": 12.135922330097088, - "grad_norm": 9.822249412536621, + "grad_norm": 10.952502250671387, "learning_rate": 2.9781553398058252e-05, - "loss": 2.2862, + "loss": 1.1486, "step": 2500 }, { "epoch": 12.37864077669903, - "grad_norm": 9.919060707092285, + "grad_norm": 10.766463279724121, "learning_rate": 2.9377022653721686e-05, - "loss": 2.1913, + "loss": 1.0227, "step": 2550 }, { "epoch": 12.62135922330097, - "grad_norm": 9.181950569152832, + "grad_norm": 9.385764122009277, "learning_rate": 2.8972491909385112e-05, - "loss": 2.241, + "loss": 1.0651, "step": 2600 }, { "epoch": 12.864077669902912, - "grad_norm": 10.82077407836914, + "grad_norm": 11.573925018310547, "learning_rate": 2.8567961165048546e-05, - "loss": 2.2653, + "loss": 1.1013, "step": 2650 }, { "epoch": 13.106796116504855, - "grad_norm": 10.365830421447754, + "grad_norm": 10.138091087341309, "learning_rate": 2.816343042071198e-05, - "loss": 2.1352, + "loss": 0.9545, "step": 2700 }, { "epoch": 13.349514563106796, - "grad_norm": 10.073640823364258, + "grad_norm": 9.867119789123535, "learning_rate": 2.7758899676375405e-05, - "loss": 2.0227, + "loss": 0.7892, "step": 2750 }, { "epoch": 13.592233009708737, - "grad_norm": 10.520242691040039, + "grad_norm": 10.443971633911133, "learning_rate": 2.735436893203884e-05, - "loss": 2.0579, + "loss": 0.8432, "step": 2800 }, { "epoch": 13.83495145631068, - "grad_norm": 11.176478385925293, + "grad_norm": 10.633298873901367, "learning_rate": 2.6949838187702265e-05, - "loss": 2.0934, + "loss": 0.8543, "step": 2850 }, { "epoch": 14.077669902912621, - "grad_norm": 9.541868209838867, + "grad_norm": 8.640830039978027, "learning_rate": 2.6545307443365695e-05, - "loss": 2.0118, + "loss": 0.7751, "step": 2900 }, { "epoch": 14.320388349514563, - "grad_norm": 10.162981986999512, + "grad_norm": 10.461247444152832, "learning_rate": 2.614077669902913e-05, - "loss": 1.8624, + "loss": 0.6074, "step": 2950 }, { "epoch": 14.563106796116505, - "grad_norm": 10.51291275024414, + "grad_norm": 10.757479667663574, "learning_rate": 2.5736245954692555e-05, - "loss": 1.8979, + "loss": 0.6354, "step": 3000 }, { "epoch": 14.805825242718447, - "grad_norm": 11.316717147827148, + "grad_norm": 11.599132537841797, "learning_rate": 2.533171521035599e-05, - "loss": 1.915, + "loss": 0.6694, "step": 3050 }, { "epoch": 15.048543689320388, - "grad_norm": 9.184353828430176, + "grad_norm": 8.085358619689941, "learning_rate": 2.492718446601942e-05, - "loss": 1.885, + "loss": 0.6306, "step": 3100 }, { "epoch": 15.29126213592233, - "grad_norm": 10.70876693725586, + "grad_norm": 8.327988624572754, "learning_rate": 2.452265372168285e-05, - "loss": 1.7024, + "loss": 0.4628, "step": 3150 }, { "epoch": 15.533980582524272, - "grad_norm": 10.767953872680664, + "grad_norm": 8.545391082763672, "learning_rate": 2.411812297734628e-05, - "loss": 1.729, + "loss": 0.4915, "step": 3200 }, { "epoch": 15.776699029126213, - "grad_norm": 11.83687973022461, + "grad_norm": 10.176375389099121, "learning_rate": 2.3713592233009708e-05, - "loss": 1.7697, + "loss": 0.5105, "step": 3250 }, { "epoch": 16.019417475728154, - "grad_norm": 9.888689994812012, + "grad_norm": 7.821159362792969, "learning_rate": 2.3309061488673138e-05, - "loss": 1.7605, + "loss": 0.5055, "step": 3300 }, { "epoch": 16.262135922330096, - "grad_norm": 9.394012451171875, + "grad_norm": 7.797800064086914, "learning_rate": 2.290453074433657e-05, - "loss": 1.5389, + "loss": 0.3486, "step": 3350 }, { "epoch": 16.50485436893204, - "grad_norm": 11.3172025680542, + "grad_norm": 8.497370719909668, "learning_rate": 2.25e-05, - "loss": 1.5836, + "loss": 0.3627, "step": 3400 }, { "epoch": 16.74757281553398, - "grad_norm": 11.000115394592285, + "grad_norm": 9.842249870300293, "learning_rate": 2.209546925566343e-05, - "loss": 1.6398, + "loss": 0.3969, "step": 3450 }, { "epoch": 16.990291262135923, - "grad_norm": 11.453404426574707, + "grad_norm": 10.705739974975586, "learning_rate": 2.169093851132686e-05, - "loss": 1.6466, + "loss": 0.4028, "step": 3500 }, { "epoch": 17.233009708737864, - "grad_norm": 11.012279510498047, + "grad_norm": 7.950681686401367, "learning_rate": 2.1286407766990295e-05, - "loss": 1.4434, + "loss": 0.284, "step": 3550 }, { "epoch": 17.475728155339805, - "grad_norm": 11.989383697509766, + "grad_norm": 8.526535034179688, "learning_rate": 2.0881877022653725e-05, - "loss": 1.4708, + "loss": 0.2934, "step": 3600 }, { "epoch": 17.718446601941746, - "grad_norm": 12.47008991241455, + "grad_norm": 8.999290466308594, "learning_rate": 2.047734627831715e-05, - "loss": 1.4783, + "loss": 0.2964, "step": 3650 }, { "epoch": 17.96116504854369, - "grad_norm": 10.983756065368652, + "grad_norm": 9.228137969970703, "learning_rate": 2.007281553398058e-05, - "loss": 1.4839, + "loss": 0.311, "step": 3700 }, { "epoch": 18.203883495145632, - "grad_norm": 9.582510948181152, + "grad_norm": 6.715802192687988, "learning_rate": 1.9668284789644014e-05, - "loss": 1.3226, + "loss": 0.2263, "step": 3750 }, { "epoch": 18.446601941747574, - "grad_norm": 11.663228034973145, + "grad_norm": 7.834264278411865, "learning_rate": 1.9263754045307444e-05, - "loss": 1.3522, + "loss": 0.2293, "step": 3800 }, { "epoch": 18.689320388349515, - "grad_norm": 11.278139114379883, + "grad_norm": 6.83228874206543, "learning_rate": 1.8859223300970874e-05, - "loss": 1.3753, + "loss": 0.2392, "step": 3850 }, { "epoch": 18.932038834951456, - "grad_norm": 10.586488723754883, + "grad_norm": 8.507209777832031, "learning_rate": 1.8454692556634304e-05, - "loss": 1.3774, + "loss": 0.2377, "step": 3900 }, { "epoch": 19.174757281553397, - "grad_norm": 11.243274688720703, + "grad_norm": 6.565855026245117, "learning_rate": 1.8050161812297738e-05, - "loss": 1.2506, + "loss": 0.1905, "step": 3950 }, { "epoch": 19.41747572815534, - "grad_norm": 10.601967811584473, + "grad_norm": 7.470388889312744, "learning_rate": 1.7645631067961167e-05, - "loss": 1.2276, + "loss": 0.1846, "step": 4000 } ], @@ -587,7 +587,7 @@ "attributes": {} } }, - "total_flos": 522593501184000.0, + "total_flos": 1857413971968000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoints/checkpoint-4000/training_args.bin b/checkpoints/checkpoint-4000/training_args.bin index 0403c28e41ead56dc065860ac748f500b2df575a..6dcc5a1a1def25e4256c25ec89c109edb25731fb 100644 --- a/checkpoints/checkpoint-4000/training_args.bin +++ b/checkpoints/checkpoint-4000/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7935e3b7629485cf26f624afe618b58a390d4c3c9c6dcf878b3ba9099ef1027b +oid sha256:d5c04ce53d557cd762ff77607aaadbd1248aaa177697662d9a21b90d444993b7 size 5304 diff --git a/checkpoints/checkpoint-4500/config.json b/checkpoints/checkpoint-4500/config.json index fdd810faba252b1c3f085663999fe9c24e8e44d6..990b1515737ce9bfc8638b4941fcfacdce1334d3 100644 --- a/checkpoints/checkpoint-4500/config.json +++ b/checkpoints/checkpoint-4500/config.json @@ -10,12 +10,12 @@ "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", - "n_ctx": 1024, - "n_embd": 768, + "n_ctx": 2048, + "n_embd": 1024, "n_head": 2, "n_inner": null, - "n_layer": 6, - "n_positions": 1024, + "n_layer": 12, + "n_positions": 2048, "pad_token_id": 1, "reorder_and_upcast_attn": false, "resid_pdrop": 0.1, diff --git a/checkpoints/checkpoint-4500/model.safetensors b/checkpoints/checkpoint-4500/model.safetensors index 008c5054dc494531a1cbd90090d190ea701a13f6..423759d5ca1f31b548237267d96fea132739f99a 100644 --- a/checkpoints/checkpoint-4500/model.safetensors +++ b/checkpoints/checkpoint-4500/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b51a1bf42be97dd74f6362135ec6a10b42f4ea2ee64da3e8045792e198567c0d -size 176343496 +oid sha256:ebb0541da09e5cfb609d483adeef00bc2ad0501c3ae4444273ec13e5ec19556b +size 617130824 diff --git a/checkpoints/checkpoint-4500/optimizer.pt b/checkpoints/checkpoint-4500/optimizer.pt index d325a086cbdeefbcf5dc4efbc6703c3f33766d3d..9bc4eb68b8cbe47d880ceff9a3cf5f3c31df8d37 100644 --- a/checkpoints/checkpoint-4500/optimizer.pt +++ b/checkpoints/checkpoint-4500/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5627ce537e783ab06ced7a600ccfa407d8fc413d26712840d8afd6f239fb207 -size 352735610 +oid sha256:dc325834365b49ed70544206aee634b00dc3bc67405ea69b0c181cd595db3143 +size 1234355130 diff --git a/checkpoints/checkpoint-4500/rng_state.pth b/checkpoints/checkpoint-4500/rng_state.pth index 4aab50e379c4b091a7216206abe0395dc5df2669..0db7aaca68562bc4ee12136f92d77e4c7859c582 100644 --- a/checkpoints/checkpoint-4500/rng_state.pth +++ b/checkpoints/checkpoint-4500/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f8adba1ade46806b0e67f011b8bb3e651b7163059c44b5b3173df73c2b84a59 +oid sha256:6ba93584c0da2ef2f3f0c556dbddf6f03b4d82613be3aa5b8fb3c23d16b4d966 size 14244 diff --git a/checkpoints/checkpoint-4500/trainer_state.json b/checkpoints/checkpoint-4500/trainer_state.json index f9a8e764c2a6690715fd434af99054de69192ef5..89af167140bb8cf1c725c8649b3934930945d3d3 100644 --- a/checkpoints/checkpoint-4500/trainer_state.json +++ b/checkpoints/checkpoint-4500/trainer_state.json @@ -11,632 +11,632 @@ "log_history": [ { "epoch": 0.24271844660194175, - "grad_norm": 2.239194869995117, + "grad_norm": 2.8362834453582764, "learning_rate": 4.960355987055016e-05, - "loss": 6.099, + "loss": 6.0742, "step": 50 }, { "epoch": 0.4854368932038835, - "grad_norm": 3.8653194904327393, + "grad_norm": 2.9484496116638184, "learning_rate": 4.9199029126213595e-05, - "loss": 5.6954, + "loss": 5.7218, "step": 100 }, { "epoch": 0.7281553398058253, - "grad_norm": 4.652886867523193, + "grad_norm": 3.542572259902954, "learning_rate": 4.879449838187702e-05, - "loss": 5.3598, + "loss": 5.4227, "step": 150 }, { "epoch": 0.970873786407767, - "grad_norm": 5.048999309539795, + "grad_norm": 4.634098052978516, "learning_rate": 4.8389967637540455e-05, - "loss": 5.0982, + "loss": 5.1655, "step": 200 }, { "epoch": 1.2135922330097086, - "grad_norm": 5.21944522857666, + "grad_norm": 5.699330806732178, "learning_rate": 4.798543689320388e-05, - "loss": 4.7456, + "loss": 4.8009, "step": 250 }, { "epoch": 1.4563106796116505, - "grad_norm": 5.471007347106934, + "grad_norm": 5.711933135986328, "learning_rate": 4.7580906148867315e-05, - "loss": 4.6254, + "loss": 4.607, "step": 300 }, { "epoch": 1.6990291262135924, - "grad_norm": 5.345245838165283, + "grad_norm": 5.113691806793213, "learning_rate": 4.717637540453075e-05, - "loss": 4.5216, + "loss": 4.462, "step": 350 }, { "epoch": 1.941747572815534, - "grad_norm": 5.54531192779541, + "grad_norm": 5.632521152496338, "learning_rate": 4.6771844660194174e-05, - "loss": 4.4353, + "loss": 4.3695, "step": 400 }, { "epoch": 2.1844660194174756, - "grad_norm": 5.244582176208496, + "grad_norm": 5.428906440734863, "learning_rate": 4.636731391585761e-05, - "loss": 4.2444, + "loss": 4.1549, "step": 450 }, { "epoch": 2.4271844660194173, - "grad_norm": 5.338261127471924, + "grad_norm": 5.037013530731201, "learning_rate": 4.596278317152104e-05, - "loss": 4.1915, + "loss": 4.0921, "step": 500 }, { "epoch": 2.6699029126213594, - "grad_norm": 5.35228157043457, + "grad_norm": 5.231846809387207, "learning_rate": 4.555825242718447e-05, - "loss": 4.1612, + "loss": 4.0662, "step": 550 }, { "epoch": 2.912621359223301, - "grad_norm": 5.287161827087402, + "grad_norm": 5.506278038024902, "learning_rate": 4.51537216828479e-05, - "loss": 4.1322, + "loss": 4.0403, "step": 600 }, { "epoch": 3.1553398058252426, - "grad_norm": 5.2932209968566895, + "grad_norm": 5.410216331481934, "learning_rate": 4.4749190938511334e-05, - "loss": 3.9684, + "loss": 3.8375, "step": 650 }, { "epoch": 3.3980582524271843, - "grad_norm": 5.4360833168029785, + "grad_norm": 5.637699127197266, "learning_rate": 4.434466019417476e-05, - "loss": 3.8933, + "loss": 3.7473, "step": 700 }, { "epoch": 3.6407766990291264, - "grad_norm": 5.5595221519470215, + "grad_norm": 5.575273513793945, "learning_rate": 4.3940129449838194e-05, - "loss": 3.9306, + "loss": 3.7835, "step": 750 }, { "epoch": 3.883495145631068, - "grad_norm": 5.540086269378662, + "grad_norm": 5.737198352813721, "learning_rate": 4.353559870550162e-05, - "loss": 3.9317, + "loss": 3.7919, "step": 800 }, { "epoch": 4.12621359223301, - "grad_norm": 5.567137718200684, + "grad_norm": 5.645532608032227, "learning_rate": 4.313106796116505e-05, - "loss": 3.8135, + "loss": 3.6425, "step": 850 }, { "epoch": 4.368932038834951, - "grad_norm": 6.00346040725708, + "grad_norm": 6.308927059173584, "learning_rate": 4.272653721682848e-05, - "loss": 3.6924, + "loss": 3.4785, "step": 900 }, { "epoch": 4.611650485436893, - "grad_norm": 5.635782718658447, + "grad_norm": 5.863094329833984, "learning_rate": 4.232200647249191e-05, - "loss": 3.7304, + "loss": 3.5268, "step": 950 }, { "epoch": 4.854368932038835, - "grad_norm": 5.9545440673828125, + "grad_norm": 6.120258331298828, "learning_rate": 4.191747572815534e-05, - "loss": 3.7471, + "loss": 3.5296, "step": 1000 }, { "epoch": 5.097087378640777, - "grad_norm": 5.696166038513184, + "grad_norm": 6.231515884399414, "learning_rate": 4.1512944983818774e-05, - "loss": 3.6287, + "loss": 3.3802, "step": 1050 }, { "epoch": 5.339805825242719, - "grad_norm": 5.844620704650879, + "grad_norm": 6.211188316345215, "learning_rate": 4.11084142394822e-05, - "loss": 3.5299, + "loss": 3.2148, "step": 1100 }, { "epoch": 5.58252427184466, - "grad_norm": 6.373577117919922, + "grad_norm": 6.8676934242248535, "learning_rate": 4.0703883495145634e-05, - "loss": 3.5285, + "loss": 3.2111, "step": 1150 }, { "epoch": 5.825242718446602, - "grad_norm": 6.488651275634766, + "grad_norm": 6.851133346557617, "learning_rate": 4.029935275080906e-05, - "loss": 3.5789, + "loss": 3.2771, "step": 1200 }, { "epoch": 6.067961165048544, - "grad_norm": 6.170980930328369, + "grad_norm": 6.708765506744385, "learning_rate": 3.9894822006472494e-05, - "loss": 3.494, + "loss": 3.1587, "step": 1250 }, { "epoch": 6.310679611650485, - "grad_norm": 6.387782573699951, + "grad_norm": 7.136553764343262, "learning_rate": 3.949029126213593e-05, - "loss": 3.353, + "loss": 2.9192, "step": 1300 }, { "epoch": 6.553398058252427, - "grad_norm": 6.718175888061523, + "grad_norm": 7.370823383331299, "learning_rate": 3.9085760517799354e-05, - "loss": 3.3521, + "loss": 2.922, "step": 1350 }, { "epoch": 6.796116504854369, - "grad_norm": 6.7870025634765625, + "grad_norm": 7.630361557006836, "learning_rate": 3.868122977346279e-05, - "loss": 3.3658, + "loss": 2.9417, "step": 1400 }, { "epoch": 7.038834951456311, - "grad_norm": 6.513721466064453, + "grad_norm": 7.765511989593506, "learning_rate": 3.827669902912622e-05, - "loss": 3.3301, + "loss": 2.8669, "step": 1450 }, { "epoch": 7.281553398058253, - "grad_norm": 7.0680036544799805, + "grad_norm": 7.85439920425415, "learning_rate": 3.787216828478965e-05, - "loss": 3.1289, + "loss": 2.5412, "step": 1500 }, { "epoch": 7.524271844660194, - "grad_norm": 7.019896507263184, + "grad_norm": 8.064071655273438, "learning_rate": 3.746763754045307e-05, - "loss": 3.1878, + "loss": 2.6131, "step": 1550 }, { "epoch": 7.766990291262136, - "grad_norm": 7.3232197761535645, + "grad_norm": 8.692522048950195, "learning_rate": 3.7063106796116507e-05, - "loss": 3.2006, + "loss": 2.6356, "step": 1600 }, { "epoch": 8.009708737864077, - "grad_norm": 7.003912925720215, + "grad_norm": 8.012410163879395, "learning_rate": 3.665857605177993e-05, - "loss": 3.1737, + "loss": 2.599, "step": 1650 }, { "epoch": 8.25242718446602, - "grad_norm": 7.592433929443359, + "grad_norm": 8.6799898147583, "learning_rate": 3.6254045307443366e-05, - "loss": 2.9258, + "loss": 2.1992, "step": 1700 }, { "epoch": 8.495145631067961, - "grad_norm": 7.629278182983398, + "grad_norm": 8.963507652282715, "learning_rate": 3.584951456310679e-05, - "loss": 2.9999, + "loss": 2.2701, "step": 1750 }, { "epoch": 8.737864077669903, - "grad_norm": 8.137636184692383, + "grad_norm": 9.833822250366211, "learning_rate": 3.5444983818770226e-05, - "loss": 3.0155, + "loss": 2.2837, "step": 1800 }, { "epoch": 8.980582524271846, - "grad_norm": 7.967227935791016, + "grad_norm": 9.47242259979248, "learning_rate": 3.504045307443366e-05, - "loss": 2.9945, + "loss": 2.2854, "step": 1850 }, { "epoch": 9.223300970873787, - "grad_norm": 7.9910078048706055, + "grad_norm": 8.856402397155762, "learning_rate": 3.4635922330097086e-05, - "loss": 2.7456, + "loss": 1.8862, "step": 1900 }, { "epoch": 9.466019417475728, - "grad_norm": 8.552720069885254, + "grad_norm": 9.948341369628906, "learning_rate": 3.423139158576052e-05, - "loss": 2.8071, + "loss": 1.922, "step": 1950 }, { "epoch": 9.70873786407767, - "grad_norm": 8.336930274963379, + "grad_norm": 10.28939151763916, "learning_rate": 3.382686084142395e-05, - "loss": 2.8117, + "loss": 1.9622, "step": 2000 }, { "epoch": 9.951456310679612, - "grad_norm": 8.636338233947754, + "grad_norm": 10.114766120910645, "learning_rate": 3.342233009708738e-05, - "loss": 2.8197, + "loss": 1.9741, "step": 2050 }, { "epoch": 10.194174757281553, - "grad_norm": 7.955165863037109, + "grad_norm": 9.489270210266113, "learning_rate": 3.301779935275081e-05, - "loss": 2.6245, + "loss": 1.6509, "step": 2100 }, { "epoch": 10.436893203883495, - "grad_norm": 8.679149627685547, + "grad_norm": 10.16482162475586, "learning_rate": 3.2613268608414246e-05, - "loss": 2.565, + "loss": 1.5883, "step": 2150 }, { "epoch": 10.679611650485437, - "grad_norm": 9.150839805603027, + "grad_norm": 11.778059005737305, "learning_rate": 3.220873786407767e-05, - "loss": 2.6258, + "loss": 1.6234, "step": 2200 }, { "epoch": 10.922330097087379, - "grad_norm": 9.2735013961792, + "grad_norm": 10.798454284667969, "learning_rate": 3.1804207119741106e-05, - "loss": 2.6439, + "loss": 1.667, "step": 2250 }, { "epoch": 11.16504854368932, - "grad_norm": 8.620939254760742, + "grad_norm": 9.751585006713867, "learning_rate": 3.139967637540453e-05, - "loss": 2.4371, + "loss": 1.3785, "step": 2300 }, { "epoch": 11.407766990291263, - "grad_norm": 9.722040176391602, + "grad_norm": 10.266364097595215, "learning_rate": 3.099514563106796e-05, - "loss": 2.4074, + "loss": 1.2956, "step": 2350 }, { "epoch": 11.650485436893204, - "grad_norm": 9.200730323791504, + "grad_norm": 10.660856246948242, "learning_rate": 3.059061488673139e-05, - "loss": 2.4307, + "loss": 1.3321, "step": 2400 }, { "epoch": 11.893203883495145, - "grad_norm": 9.692273139953613, + "grad_norm": 10.896187782287598, "learning_rate": 3.0186084142394822e-05, - "loss": 2.424, + "loss": 1.3533, "step": 2450 }, { "epoch": 12.135922330097088, - "grad_norm": 9.822249412536621, + "grad_norm": 10.952502250671387, "learning_rate": 2.9781553398058252e-05, - "loss": 2.2862, + "loss": 1.1486, "step": 2500 }, { "epoch": 12.37864077669903, - "grad_norm": 9.919060707092285, + "grad_norm": 10.766463279724121, "learning_rate": 2.9377022653721686e-05, - "loss": 2.1913, + "loss": 1.0227, "step": 2550 }, { "epoch": 12.62135922330097, - "grad_norm": 9.181950569152832, + "grad_norm": 9.385764122009277, "learning_rate": 2.8972491909385112e-05, - "loss": 2.241, + "loss": 1.0651, "step": 2600 }, { "epoch": 12.864077669902912, - "grad_norm": 10.82077407836914, + "grad_norm": 11.573925018310547, "learning_rate": 2.8567961165048546e-05, - "loss": 2.2653, + "loss": 1.1013, "step": 2650 }, { "epoch": 13.106796116504855, - "grad_norm": 10.365830421447754, + "grad_norm": 10.138091087341309, "learning_rate": 2.816343042071198e-05, - "loss": 2.1352, + "loss": 0.9545, "step": 2700 }, { "epoch": 13.349514563106796, - "grad_norm": 10.073640823364258, + "grad_norm": 9.867119789123535, "learning_rate": 2.7758899676375405e-05, - "loss": 2.0227, + "loss": 0.7892, "step": 2750 }, { "epoch": 13.592233009708737, - "grad_norm": 10.520242691040039, + "grad_norm": 10.443971633911133, "learning_rate": 2.735436893203884e-05, - "loss": 2.0579, + "loss": 0.8432, "step": 2800 }, { "epoch": 13.83495145631068, - "grad_norm": 11.176478385925293, + "grad_norm": 10.633298873901367, "learning_rate": 2.6949838187702265e-05, - "loss": 2.0934, + "loss": 0.8543, "step": 2850 }, { "epoch": 14.077669902912621, - "grad_norm": 9.541868209838867, + "grad_norm": 8.640830039978027, "learning_rate": 2.6545307443365695e-05, - "loss": 2.0118, + "loss": 0.7751, "step": 2900 }, { "epoch": 14.320388349514563, - "grad_norm": 10.162981986999512, + "grad_norm": 10.461247444152832, "learning_rate": 2.614077669902913e-05, - "loss": 1.8624, + "loss": 0.6074, "step": 2950 }, { "epoch": 14.563106796116505, - "grad_norm": 10.51291275024414, + "grad_norm": 10.757479667663574, "learning_rate": 2.5736245954692555e-05, - "loss": 1.8979, + "loss": 0.6354, "step": 3000 }, { "epoch": 14.805825242718447, - "grad_norm": 11.316717147827148, + "grad_norm": 11.599132537841797, "learning_rate": 2.533171521035599e-05, - "loss": 1.915, + "loss": 0.6694, "step": 3050 }, { "epoch": 15.048543689320388, - "grad_norm": 9.184353828430176, + "grad_norm": 8.085358619689941, "learning_rate": 2.492718446601942e-05, - "loss": 1.885, + "loss": 0.6306, "step": 3100 }, { "epoch": 15.29126213592233, - "grad_norm": 10.70876693725586, + "grad_norm": 8.327988624572754, "learning_rate": 2.452265372168285e-05, - "loss": 1.7024, + "loss": 0.4628, "step": 3150 }, { "epoch": 15.533980582524272, - "grad_norm": 10.767953872680664, + "grad_norm": 8.545391082763672, "learning_rate": 2.411812297734628e-05, - "loss": 1.729, + "loss": 0.4915, "step": 3200 }, { "epoch": 15.776699029126213, - "grad_norm": 11.83687973022461, + "grad_norm": 10.176375389099121, "learning_rate": 2.3713592233009708e-05, - "loss": 1.7697, + "loss": 0.5105, "step": 3250 }, { "epoch": 16.019417475728154, - "grad_norm": 9.888689994812012, + "grad_norm": 7.821159362792969, "learning_rate": 2.3309061488673138e-05, - "loss": 1.7605, + "loss": 0.5055, "step": 3300 }, { "epoch": 16.262135922330096, - "grad_norm": 9.394012451171875, + "grad_norm": 7.797800064086914, "learning_rate": 2.290453074433657e-05, - "loss": 1.5389, + "loss": 0.3486, "step": 3350 }, { "epoch": 16.50485436893204, - "grad_norm": 11.3172025680542, + "grad_norm": 8.497370719909668, "learning_rate": 2.25e-05, - "loss": 1.5836, + "loss": 0.3627, "step": 3400 }, { "epoch": 16.74757281553398, - "grad_norm": 11.000115394592285, + "grad_norm": 9.842249870300293, "learning_rate": 2.209546925566343e-05, - "loss": 1.6398, + "loss": 0.3969, "step": 3450 }, { "epoch": 16.990291262135923, - "grad_norm": 11.453404426574707, + "grad_norm": 10.705739974975586, "learning_rate": 2.169093851132686e-05, - "loss": 1.6466, + "loss": 0.4028, "step": 3500 }, { "epoch": 17.233009708737864, - "grad_norm": 11.012279510498047, + "grad_norm": 7.950681686401367, "learning_rate": 2.1286407766990295e-05, - "loss": 1.4434, + "loss": 0.284, "step": 3550 }, { "epoch": 17.475728155339805, - "grad_norm": 11.989383697509766, + "grad_norm": 8.526535034179688, "learning_rate": 2.0881877022653725e-05, - "loss": 1.4708, + "loss": 0.2934, "step": 3600 }, { "epoch": 17.718446601941746, - "grad_norm": 12.47008991241455, + "grad_norm": 8.999290466308594, "learning_rate": 2.047734627831715e-05, - "loss": 1.4783, + "loss": 0.2964, "step": 3650 }, { "epoch": 17.96116504854369, - "grad_norm": 10.983756065368652, + "grad_norm": 9.228137969970703, "learning_rate": 2.007281553398058e-05, - "loss": 1.4839, + "loss": 0.311, "step": 3700 }, { "epoch": 18.203883495145632, - "grad_norm": 9.582510948181152, + "grad_norm": 6.715802192687988, "learning_rate": 1.9668284789644014e-05, - "loss": 1.3226, + "loss": 0.2263, "step": 3750 }, { "epoch": 18.446601941747574, - "grad_norm": 11.663228034973145, + "grad_norm": 7.834264278411865, "learning_rate": 1.9263754045307444e-05, - "loss": 1.3522, + "loss": 0.2293, "step": 3800 }, { "epoch": 18.689320388349515, - "grad_norm": 11.278139114379883, + "grad_norm": 6.83228874206543, "learning_rate": 1.8859223300970874e-05, - "loss": 1.3753, + "loss": 0.2392, "step": 3850 }, { "epoch": 18.932038834951456, - "grad_norm": 10.586488723754883, + "grad_norm": 8.507209777832031, "learning_rate": 1.8454692556634304e-05, - "loss": 1.3774, + "loss": 0.2377, "step": 3900 }, { "epoch": 19.174757281553397, - "grad_norm": 11.243274688720703, + "grad_norm": 6.565855026245117, "learning_rate": 1.8050161812297738e-05, - "loss": 1.2506, + "loss": 0.1905, "step": 3950 }, { "epoch": 19.41747572815534, - "grad_norm": 10.601967811584473, + "grad_norm": 7.470388889312744, "learning_rate": 1.7645631067961167e-05, - "loss": 1.2276, + "loss": 0.1846, "step": 4000 }, { "epoch": 19.660194174757283, - "grad_norm": 10.919422149658203, + "grad_norm": 7.033578395843506, "learning_rate": 1.7241100323624594e-05, - "loss": 1.2518, + "loss": 0.1911, "step": 4050 }, { "epoch": 19.902912621359224, - "grad_norm": 11.303253173828125, + "grad_norm": 8.110376358032227, "learning_rate": 1.6836569579288027e-05, - "loss": 1.279, + "loss": 0.1954, "step": 4100 }, { "epoch": 20.145631067961165, - "grad_norm": 10.864278793334961, + "grad_norm": 7.335451602935791, "learning_rate": 1.6432038834951457e-05, - "loss": 1.1686, + "loss": 0.1619, "step": 4150 }, { "epoch": 20.388349514563107, - "grad_norm": 11.179913520812988, + "grad_norm": 7.478648662567139, "learning_rate": 1.6027508090614887e-05, - "loss": 1.1252, + "loss": 0.1497, "step": 4200 }, { "epoch": 20.631067961165048, - "grad_norm": 10.640514373779297, + "grad_norm": 7.764925479888916, "learning_rate": 1.5622977346278317e-05, - "loss": 1.1581, + "loss": 0.1559, "step": 4250 }, { "epoch": 20.87378640776699, - "grad_norm": 11.186044692993164, + "grad_norm": 6.432894706726074, "learning_rate": 1.5218446601941749e-05, - "loss": 1.1782, + "loss": 0.1601, "step": 4300 }, { "epoch": 21.116504854368934, - "grad_norm": 11.105583190917969, + "grad_norm": 7.728542327880859, "learning_rate": 1.4813915857605179e-05, - "loss": 1.0972, + "loss": 0.1416, "step": 4350 }, { "epoch": 21.359223300970875, - "grad_norm": 10.809669494628906, + "grad_norm": 6.370733737945557, "learning_rate": 1.4409385113268609e-05, - "loss": 1.0487, + "loss": 0.1211, "step": 4400 }, { "epoch": 21.601941747572816, - "grad_norm": 11.11020278930664, + "grad_norm": 5.939419746398926, "learning_rate": 1.4004854368932039e-05, - "loss": 1.0566, + "loss": 0.1278, "step": 4450 }, { "epoch": 21.844660194174757, - "grad_norm": 11.062104225158691, + "grad_norm": 6.0013957023620605, "learning_rate": 1.360032362459547e-05, - "loss": 1.078, + "loss": 0.1274, "step": 4500 } ], @@ -657,7 +657,7 @@ "attributes": {} } }, - "total_flos": 587917688832000.0, + "total_flos": 2089590718464000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoints/checkpoint-4500/training_args.bin b/checkpoints/checkpoint-4500/training_args.bin index 0403c28e41ead56dc065860ac748f500b2df575a..6dcc5a1a1def25e4256c25ec89c109edb25731fb 100644 --- a/checkpoints/checkpoint-4500/training_args.bin +++ b/checkpoints/checkpoint-4500/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7935e3b7629485cf26f624afe618b58a390d4c3c9c6dcf878b3ba9099ef1027b +oid sha256:d5c04ce53d557cd762ff77607aaadbd1248aaa177697662d9a21b90d444993b7 size 5304 diff --git a/checkpoints/checkpoint-500/config.json b/checkpoints/checkpoint-500/config.json index fdd810faba252b1c3f085663999fe9c24e8e44d6..990b1515737ce9bfc8638b4941fcfacdce1334d3 100644 --- a/checkpoints/checkpoint-500/config.json +++ b/checkpoints/checkpoint-500/config.json @@ -10,12 +10,12 @@ "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", - "n_ctx": 1024, - "n_embd": 768, + "n_ctx": 2048, + "n_embd": 1024, "n_head": 2, "n_inner": null, - "n_layer": 6, - "n_positions": 1024, + "n_layer": 12, + "n_positions": 2048, "pad_token_id": 1, "reorder_and_upcast_attn": false, "resid_pdrop": 0.1, diff --git a/checkpoints/checkpoint-500/model.safetensors b/checkpoints/checkpoint-500/model.safetensors index 029e3aef1902aef3ad923b0c4f967ab1fb50eb2a..56c625f0477e783c061bd0cb153974609056fb9a 100644 --- a/checkpoints/checkpoint-500/model.safetensors +++ b/checkpoints/checkpoint-500/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4c1afbb39df3411cf84f25ad19241038d8abaac41edf1c0a677dd67a17d89ed -size 176343496 +oid sha256:3d034447d0bc7b5f683ac7a16c807a7ff485dd3b1b8422db07d941ad747c0e7e +size 617130824 diff --git a/checkpoints/checkpoint-500/optimizer.pt b/checkpoints/checkpoint-500/optimizer.pt index f11b940b531368258dfa735e453c335c10f48d05..e0ec772424bf4b2c204c50e44284e779c1b0389f 100644 --- a/checkpoints/checkpoint-500/optimizer.pt +++ b/checkpoints/checkpoint-500/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8da3e16f41a201489b2992d544d1b99e830f95e73f551e3c0c5458655519c05e -size 352735610 +oid sha256:884a73c849fea23080f3c5b973b293318a6378151eefc4294d005938353e8eb3 +size 1234355130 diff --git a/checkpoints/checkpoint-500/rng_state.pth b/checkpoints/checkpoint-500/rng_state.pth index 04aeea3b556b143b0890ddb31089a99192d6d9b8..7aff725f82359367bdc298fc59231a661ce4c10d 100644 --- a/checkpoints/checkpoint-500/rng_state.pth +++ b/checkpoints/checkpoint-500/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3e2380a67a53a9385b5c3c37f5ad0a4990de55596f2382b5db44a545dc24a583 +oid sha256:e2dccfba15ed1aba9d08597eefbae533c8c2d0d42ec26c24724211d605a02862 size 14244 diff --git a/checkpoints/checkpoint-500/trainer_state.json b/checkpoints/checkpoint-500/trainer_state.json index 268bbd23849170c62b806d958f86481e99982880..e8a98f5dc3c1e71c0f09daae1f3926505a2f22f0 100644 --- a/checkpoints/checkpoint-500/trainer_state.json +++ b/checkpoints/checkpoint-500/trainer_state.json @@ -11,72 +11,72 @@ "log_history": [ { "epoch": 0.24271844660194175, - "grad_norm": 2.239194869995117, + "grad_norm": 2.8362834453582764, "learning_rate": 4.960355987055016e-05, - "loss": 6.099, + "loss": 6.0742, "step": 50 }, { "epoch": 0.4854368932038835, - "grad_norm": 3.8653194904327393, + "grad_norm": 2.9484496116638184, "learning_rate": 4.9199029126213595e-05, - "loss": 5.6954, + "loss": 5.7218, "step": 100 }, { "epoch": 0.7281553398058253, - "grad_norm": 4.652886867523193, + "grad_norm": 3.542572259902954, "learning_rate": 4.879449838187702e-05, - "loss": 5.3598, + "loss": 5.4227, "step": 150 }, { "epoch": 0.970873786407767, - "grad_norm": 5.048999309539795, + "grad_norm": 4.634098052978516, "learning_rate": 4.8389967637540455e-05, - "loss": 5.0982, + "loss": 5.1655, "step": 200 }, { "epoch": 1.2135922330097086, - "grad_norm": 5.21944522857666, + "grad_norm": 5.699330806732178, "learning_rate": 4.798543689320388e-05, - "loss": 4.7456, + "loss": 4.8009, "step": 250 }, { "epoch": 1.4563106796116505, - "grad_norm": 5.471007347106934, + "grad_norm": 5.711933135986328, "learning_rate": 4.7580906148867315e-05, - "loss": 4.6254, + "loss": 4.607, "step": 300 }, { "epoch": 1.6990291262135924, - "grad_norm": 5.345245838165283, + "grad_norm": 5.113691806793213, "learning_rate": 4.717637540453075e-05, - "loss": 4.5216, + "loss": 4.462, "step": 350 }, { "epoch": 1.941747572815534, - "grad_norm": 5.54531192779541, + "grad_norm": 5.632521152496338, "learning_rate": 4.6771844660194174e-05, - "loss": 4.4353, + "loss": 4.3695, "step": 400 }, { "epoch": 2.1844660194174756, - "grad_norm": 5.244582176208496, + "grad_norm": 5.428906440734863, "learning_rate": 4.636731391585761e-05, - "loss": 4.2444, + "loss": 4.1549, "step": 450 }, { "epoch": 2.4271844660194173, - "grad_norm": 5.338261127471924, + "grad_norm": 5.037013530731201, "learning_rate": 4.596278317152104e-05, - "loss": 4.1915, + "loss": 4.0921, "step": 500 } ], @@ -97,7 +97,7 @@ "attributes": {} } }, - "total_flos": 65324187648000.0, + "total_flos": 232176746496000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoints/checkpoint-500/training_args.bin b/checkpoints/checkpoint-500/training_args.bin index 0403c28e41ead56dc065860ac748f500b2df575a..6dcc5a1a1def25e4256c25ec89c109edb25731fb 100644 --- a/checkpoints/checkpoint-500/training_args.bin +++ b/checkpoints/checkpoint-500/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7935e3b7629485cf26f624afe618b58a390d4c3c9c6dcf878b3ba9099ef1027b +oid sha256:d5c04ce53d557cd762ff77607aaadbd1248aaa177697662d9a21b90d444993b7 size 5304 diff --git a/checkpoints/checkpoint-5000/config.json b/checkpoints/checkpoint-5000/config.json index fdd810faba252b1c3f085663999fe9c24e8e44d6..990b1515737ce9bfc8638b4941fcfacdce1334d3 100644 --- a/checkpoints/checkpoint-5000/config.json +++ b/checkpoints/checkpoint-5000/config.json @@ -10,12 +10,12 @@ "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", - "n_ctx": 1024, - "n_embd": 768, + "n_ctx": 2048, + "n_embd": 1024, "n_head": 2, "n_inner": null, - "n_layer": 6, - "n_positions": 1024, + "n_layer": 12, + "n_positions": 2048, "pad_token_id": 1, "reorder_and_upcast_attn": false, "resid_pdrop": 0.1, diff --git a/checkpoints/checkpoint-5000/model.safetensors b/checkpoints/checkpoint-5000/model.safetensors index 01511287f2195b3b3d9b90534d2d40fa6a9608e3..56113ce294f6f0b6ee3c5284513a136e1758ca88 100644 --- a/checkpoints/checkpoint-5000/model.safetensors +++ b/checkpoints/checkpoint-5000/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cca1bc49b67480ffa73ef99d3dd4267ece3cd384aa12183c64d2d5627624ab95 -size 176343496 +oid sha256:7fb0c09440291d72591c2348bbb62c8d4ac2d16465042cda58e12d3f37ece975 +size 617130824 diff --git a/checkpoints/checkpoint-5000/optimizer.pt b/checkpoints/checkpoint-5000/optimizer.pt index 626c24ec63ad8f1d8cbf5edf8c8bf22bc579984d..317a41c67651eb9db6c6f59fa819fc3f04630d96 100644 --- a/checkpoints/checkpoint-5000/optimizer.pt +++ b/checkpoints/checkpoint-5000/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d2dde814f5f4a6369ec42b5837437fea636742525202277486f84437ae7a67cb -size 352735610 +oid sha256:e2ad17665791e30430a135fe900c080592800c93238783a5945df6e7bba0856e +size 1234355130 diff --git a/checkpoints/checkpoint-5000/rng_state.pth b/checkpoints/checkpoint-5000/rng_state.pth index 61392e7fccb47ebec30f53a3db9349a159c23fbc..aea3b95b4fd442dbb29665f2e20444892e28e0e3 100644 --- a/checkpoints/checkpoint-5000/rng_state.pth +++ b/checkpoints/checkpoint-5000/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:62ccbc8957bbdef56df98ebe9f4c6cc934f917617f416d994f1372b511ea2247 +oid sha256:701e1322fd808bf12cc423711fcadc2ce84c2479cd38c2135bee4c55273578a6 size 14244 diff --git a/checkpoints/checkpoint-5000/trainer_state.json b/checkpoints/checkpoint-5000/trainer_state.json index a0a9eb3229dde3b86ea61b5ef1e522b4caf25c3a..92d309cd00c2526cfdbbb18a811c4ec40e3071cc 100644 --- a/checkpoints/checkpoint-5000/trainer_state.json +++ b/checkpoints/checkpoint-5000/trainer_state.json @@ -11,702 +11,702 @@ "log_history": [ { "epoch": 0.24271844660194175, - "grad_norm": 2.239194869995117, + "grad_norm": 2.8362834453582764, "learning_rate": 4.960355987055016e-05, - "loss": 6.099, + "loss": 6.0742, "step": 50 }, { "epoch": 0.4854368932038835, - "grad_norm": 3.8653194904327393, + "grad_norm": 2.9484496116638184, "learning_rate": 4.9199029126213595e-05, - "loss": 5.6954, + "loss": 5.7218, "step": 100 }, { "epoch": 0.7281553398058253, - "grad_norm": 4.652886867523193, + "grad_norm": 3.542572259902954, "learning_rate": 4.879449838187702e-05, - "loss": 5.3598, + "loss": 5.4227, "step": 150 }, { "epoch": 0.970873786407767, - "grad_norm": 5.048999309539795, + "grad_norm": 4.634098052978516, "learning_rate": 4.8389967637540455e-05, - "loss": 5.0982, + "loss": 5.1655, "step": 200 }, { "epoch": 1.2135922330097086, - "grad_norm": 5.21944522857666, + "grad_norm": 5.699330806732178, "learning_rate": 4.798543689320388e-05, - "loss": 4.7456, + "loss": 4.8009, "step": 250 }, { "epoch": 1.4563106796116505, - "grad_norm": 5.471007347106934, + "grad_norm": 5.711933135986328, "learning_rate": 4.7580906148867315e-05, - "loss": 4.6254, + "loss": 4.607, "step": 300 }, { "epoch": 1.6990291262135924, - "grad_norm": 5.345245838165283, + "grad_norm": 5.113691806793213, "learning_rate": 4.717637540453075e-05, - "loss": 4.5216, + "loss": 4.462, "step": 350 }, { "epoch": 1.941747572815534, - "grad_norm": 5.54531192779541, + "grad_norm": 5.632521152496338, "learning_rate": 4.6771844660194174e-05, - "loss": 4.4353, + "loss": 4.3695, "step": 400 }, { "epoch": 2.1844660194174756, - "grad_norm": 5.244582176208496, + "grad_norm": 5.428906440734863, "learning_rate": 4.636731391585761e-05, - "loss": 4.2444, + "loss": 4.1549, "step": 450 }, { "epoch": 2.4271844660194173, - "grad_norm": 5.338261127471924, + "grad_norm": 5.037013530731201, "learning_rate": 4.596278317152104e-05, - "loss": 4.1915, + "loss": 4.0921, "step": 500 }, { "epoch": 2.6699029126213594, - "grad_norm": 5.35228157043457, + "grad_norm": 5.231846809387207, "learning_rate": 4.555825242718447e-05, - "loss": 4.1612, + "loss": 4.0662, "step": 550 }, { "epoch": 2.912621359223301, - "grad_norm": 5.287161827087402, + "grad_norm": 5.506278038024902, "learning_rate": 4.51537216828479e-05, - "loss": 4.1322, + "loss": 4.0403, "step": 600 }, { "epoch": 3.1553398058252426, - "grad_norm": 5.2932209968566895, + "grad_norm": 5.410216331481934, "learning_rate": 4.4749190938511334e-05, - "loss": 3.9684, + "loss": 3.8375, "step": 650 }, { "epoch": 3.3980582524271843, - "grad_norm": 5.4360833168029785, + "grad_norm": 5.637699127197266, "learning_rate": 4.434466019417476e-05, - "loss": 3.8933, + "loss": 3.7473, "step": 700 }, { "epoch": 3.6407766990291264, - "grad_norm": 5.5595221519470215, + "grad_norm": 5.575273513793945, "learning_rate": 4.3940129449838194e-05, - "loss": 3.9306, + "loss": 3.7835, "step": 750 }, { "epoch": 3.883495145631068, - "grad_norm": 5.540086269378662, + "grad_norm": 5.737198352813721, "learning_rate": 4.353559870550162e-05, - "loss": 3.9317, + "loss": 3.7919, "step": 800 }, { "epoch": 4.12621359223301, - "grad_norm": 5.567137718200684, + "grad_norm": 5.645532608032227, "learning_rate": 4.313106796116505e-05, - "loss": 3.8135, + "loss": 3.6425, "step": 850 }, { "epoch": 4.368932038834951, - "grad_norm": 6.00346040725708, + "grad_norm": 6.308927059173584, "learning_rate": 4.272653721682848e-05, - "loss": 3.6924, + "loss": 3.4785, "step": 900 }, { "epoch": 4.611650485436893, - "grad_norm": 5.635782718658447, + "grad_norm": 5.863094329833984, "learning_rate": 4.232200647249191e-05, - "loss": 3.7304, + "loss": 3.5268, "step": 950 }, { "epoch": 4.854368932038835, - "grad_norm": 5.9545440673828125, + "grad_norm": 6.120258331298828, "learning_rate": 4.191747572815534e-05, - "loss": 3.7471, + "loss": 3.5296, "step": 1000 }, { "epoch": 5.097087378640777, - "grad_norm": 5.696166038513184, + "grad_norm": 6.231515884399414, "learning_rate": 4.1512944983818774e-05, - "loss": 3.6287, + "loss": 3.3802, "step": 1050 }, { "epoch": 5.339805825242719, - "grad_norm": 5.844620704650879, + "grad_norm": 6.211188316345215, "learning_rate": 4.11084142394822e-05, - "loss": 3.5299, + "loss": 3.2148, "step": 1100 }, { "epoch": 5.58252427184466, - "grad_norm": 6.373577117919922, + "grad_norm": 6.8676934242248535, "learning_rate": 4.0703883495145634e-05, - "loss": 3.5285, + "loss": 3.2111, "step": 1150 }, { "epoch": 5.825242718446602, - "grad_norm": 6.488651275634766, + "grad_norm": 6.851133346557617, "learning_rate": 4.029935275080906e-05, - "loss": 3.5789, + "loss": 3.2771, "step": 1200 }, { "epoch": 6.067961165048544, - "grad_norm": 6.170980930328369, + "grad_norm": 6.708765506744385, "learning_rate": 3.9894822006472494e-05, - "loss": 3.494, + "loss": 3.1587, "step": 1250 }, { "epoch": 6.310679611650485, - "grad_norm": 6.387782573699951, + "grad_norm": 7.136553764343262, "learning_rate": 3.949029126213593e-05, - "loss": 3.353, + "loss": 2.9192, "step": 1300 }, { "epoch": 6.553398058252427, - "grad_norm": 6.718175888061523, + "grad_norm": 7.370823383331299, "learning_rate": 3.9085760517799354e-05, - "loss": 3.3521, + "loss": 2.922, "step": 1350 }, { "epoch": 6.796116504854369, - "grad_norm": 6.7870025634765625, + "grad_norm": 7.630361557006836, "learning_rate": 3.868122977346279e-05, - "loss": 3.3658, + "loss": 2.9417, "step": 1400 }, { "epoch": 7.038834951456311, - "grad_norm": 6.513721466064453, + "grad_norm": 7.765511989593506, "learning_rate": 3.827669902912622e-05, - "loss": 3.3301, + "loss": 2.8669, "step": 1450 }, { "epoch": 7.281553398058253, - "grad_norm": 7.0680036544799805, + "grad_norm": 7.85439920425415, "learning_rate": 3.787216828478965e-05, - "loss": 3.1289, + "loss": 2.5412, "step": 1500 }, { "epoch": 7.524271844660194, - "grad_norm": 7.019896507263184, + "grad_norm": 8.064071655273438, "learning_rate": 3.746763754045307e-05, - "loss": 3.1878, + "loss": 2.6131, "step": 1550 }, { "epoch": 7.766990291262136, - "grad_norm": 7.3232197761535645, + "grad_norm": 8.692522048950195, "learning_rate": 3.7063106796116507e-05, - "loss": 3.2006, + "loss": 2.6356, "step": 1600 }, { "epoch": 8.009708737864077, - "grad_norm": 7.003912925720215, + "grad_norm": 8.012410163879395, "learning_rate": 3.665857605177993e-05, - "loss": 3.1737, + "loss": 2.599, "step": 1650 }, { "epoch": 8.25242718446602, - "grad_norm": 7.592433929443359, + "grad_norm": 8.6799898147583, "learning_rate": 3.6254045307443366e-05, - "loss": 2.9258, + "loss": 2.1992, "step": 1700 }, { "epoch": 8.495145631067961, - "grad_norm": 7.629278182983398, + "grad_norm": 8.963507652282715, "learning_rate": 3.584951456310679e-05, - "loss": 2.9999, + "loss": 2.2701, "step": 1750 }, { "epoch": 8.737864077669903, - "grad_norm": 8.137636184692383, + "grad_norm": 9.833822250366211, "learning_rate": 3.5444983818770226e-05, - "loss": 3.0155, + "loss": 2.2837, "step": 1800 }, { "epoch": 8.980582524271846, - "grad_norm": 7.967227935791016, + "grad_norm": 9.47242259979248, "learning_rate": 3.504045307443366e-05, - "loss": 2.9945, + "loss": 2.2854, "step": 1850 }, { "epoch": 9.223300970873787, - "grad_norm": 7.9910078048706055, + "grad_norm": 8.856402397155762, "learning_rate": 3.4635922330097086e-05, - "loss": 2.7456, + "loss": 1.8862, "step": 1900 }, { "epoch": 9.466019417475728, - "grad_norm": 8.552720069885254, + "grad_norm": 9.948341369628906, "learning_rate": 3.423139158576052e-05, - "loss": 2.8071, + "loss": 1.922, "step": 1950 }, { "epoch": 9.70873786407767, - "grad_norm": 8.336930274963379, + "grad_norm": 10.28939151763916, "learning_rate": 3.382686084142395e-05, - "loss": 2.8117, + "loss": 1.9622, "step": 2000 }, { "epoch": 9.951456310679612, - "grad_norm": 8.636338233947754, + "grad_norm": 10.114766120910645, "learning_rate": 3.342233009708738e-05, - "loss": 2.8197, + "loss": 1.9741, "step": 2050 }, { "epoch": 10.194174757281553, - "grad_norm": 7.955165863037109, + "grad_norm": 9.489270210266113, "learning_rate": 3.301779935275081e-05, - "loss": 2.6245, + "loss": 1.6509, "step": 2100 }, { "epoch": 10.436893203883495, - "grad_norm": 8.679149627685547, + "grad_norm": 10.16482162475586, "learning_rate": 3.2613268608414246e-05, - "loss": 2.565, + "loss": 1.5883, "step": 2150 }, { "epoch": 10.679611650485437, - "grad_norm": 9.150839805603027, + "grad_norm": 11.778059005737305, "learning_rate": 3.220873786407767e-05, - "loss": 2.6258, + "loss": 1.6234, "step": 2200 }, { "epoch": 10.922330097087379, - "grad_norm": 9.2735013961792, + "grad_norm": 10.798454284667969, "learning_rate": 3.1804207119741106e-05, - "loss": 2.6439, + "loss": 1.667, "step": 2250 }, { "epoch": 11.16504854368932, - "grad_norm": 8.620939254760742, + "grad_norm": 9.751585006713867, "learning_rate": 3.139967637540453e-05, - "loss": 2.4371, + "loss": 1.3785, "step": 2300 }, { "epoch": 11.407766990291263, - "grad_norm": 9.722040176391602, + "grad_norm": 10.266364097595215, "learning_rate": 3.099514563106796e-05, - "loss": 2.4074, + "loss": 1.2956, "step": 2350 }, { "epoch": 11.650485436893204, - "grad_norm": 9.200730323791504, + "grad_norm": 10.660856246948242, "learning_rate": 3.059061488673139e-05, - "loss": 2.4307, + "loss": 1.3321, "step": 2400 }, { "epoch": 11.893203883495145, - "grad_norm": 9.692273139953613, + "grad_norm": 10.896187782287598, "learning_rate": 3.0186084142394822e-05, - "loss": 2.424, + "loss": 1.3533, "step": 2450 }, { "epoch": 12.135922330097088, - "grad_norm": 9.822249412536621, + "grad_norm": 10.952502250671387, "learning_rate": 2.9781553398058252e-05, - "loss": 2.2862, + "loss": 1.1486, "step": 2500 }, { "epoch": 12.37864077669903, - "grad_norm": 9.919060707092285, + "grad_norm": 10.766463279724121, "learning_rate": 2.9377022653721686e-05, - "loss": 2.1913, + "loss": 1.0227, "step": 2550 }, { "epoch": 12.62135922330097, - "grad_norm": 9.181950569152832, + "grad_norm": 9.385764122009277, "learning_rate": 2.8972491909385112e-05, - "loss": 2.241, + "loss": 1.0651, "step": 2600 }, { "epoch": 12.864077669902912, - "grad_norm": 10.82077407836914, + "grad_norm": 11.573925018310547, "learning_rate": 2.8567961165048546e-05, - "loss": 2.2653, + "loss": 1.1013, "step": 2650 }, { "epoch": 13.106796116504855, - "grad_norm": 10.365830421447754, + "grad_norm": 10.138091087341309, "learning_rate": 2.816343042071198e-05, - "loss": 2.1352, + "loss": 0.9545, "step": 2700 }, { "epoch": 13.349514563106796, - "grad_norm": 10.073640823364258, + "grad_norm": 9.867119789123535, "learning_rate": 2.7758899676375405e-05, - "loss": 2.0227, + "loss": 0.7892, "step": 2750 }, { "epoch": 13.592233009708737, - "grad_norm": 10.520242691040039, + "grad_norm": 10.443971633911133, "learning_rate": 2.735436893203884e-05, - "loss": 2.0579, + "loss": 0.8432, "step": 2800 }, { "epoch": 13.83495145631068, - "grad_norm": 11.176478385925293, + "grad_norm": 10.633298873901367, "learning_rate": 2.6949838187702265e-05, - "loss": 2.0934, + "loss": 0.8543, "step": 2850 }, { "epoch": 14.077669902912621, - "grad_norm": 9.541868209838867, + "grad_norm": 8.640830039978027, "learning_rate": 2.6545307443365695e-05, - "loss": 2.0118, + "loss": 0.7751, "step": 2900 }, { "epoch": 14.320388349514563, - "grad_norm": 10.162981986999512, + "grad_norm": 10.461247444152832, "learning_rate": 2.614077669902913e-05, - "loss": 1.8624, + "loss": 0.6074, "step": 2950 }, { "epoch": 14.563106796116505, - "grad_norm": 10.51291275024414, + "grad_norm": 10.757479667663574, "learning_rate": 2.5736245954692555e-05, - "loss": 1.8979, + "loss": 0.6354, "step": 3000 }, { "epoch": 14.805825242718447, - "grad_norm": 11.316717147827148, + "grad_norm": 11.599132537841797, "learning_rate": 2.533171521035599e-05, - "loss": 1.915, + "loss": 0.6694, "step": 3050 }, { "epoch": 15.048543689320388, - "grad_norm": 9.184353828430176, + "grad_norm": 8.085358619689941, "learning_rate": 2.492718446601942e-05, - "loss": 1.885, + "loss": 0.6306, "step": 3100 }, { "epoch": 15.29126213592233, - "grad_norm": 10.70876693725586, + "grad_norm": 8.327988624572754, "learning_rate": 2.452265372168285e-05, - "loss": 1.7024, + "loss": 0.4628, "step": 3150 }, { "epoch": 15.533980582524272, - "grad_norm": 10.767953872680664, + "grad_norm": 8.545391082763672, "learning_rate": 2.411812297734628e-05, - "loss": 1.729, + "loss": 0.4915, "step": 3200 }, { "epoch": 15.776699029126213, - "grad_norm": 11.83687973022461, + "grad_norm": 10.176375389099121, "learning_rate": 2.3713592233009708e-05, - "loss": 1.7697, + "loss": 0.5105, "step": 3250 }, { "epoch": 16.019417475728154, - "grad_norm": 9.888689994812012, + "grad_norm": 7.821159362792969, "learning_rate": 2.3309061488673138e-05, - "loss": 1.7605, + "loss": 0.5055, "step": 3300 }, { "epoch": 16.262135922330096, - "grad_norm": 9.394012451171875, + "grad_norm": 7.797800064086914, "learning_rate": 2.290453074433657e-05, - "loss": 1.5389, + "loss": 0.3486, "step": 3350 }, { "epoch": 16.50485436893204, - "grad_norm": 11.3172025680542, + "grad_norm": 8.497370719909668, "learning_rate": 2.25e-05, - "loss": 1.5836, + "loss": 0.3627, "step": 3400 }, { "epoch": 16.74757281553398, - "grad_norm": 11.000115394592285, + "grad_norm": 9.842249870300293, "learning_rate": 2.209546925566343e-05, - "loss": 1.6398, + "loss": 0.3969, "step": 3450 }, { "epoch": 16.990291262135923, - "grad_norm": 11.453404426574707, + "grad_norm": 10.705739974975586, "learning_rate": 2.169093851132686e-05, - "loss": 1.6466, + "loss": 0.4028, "step": 3500 }, { "epoch": 17.233009708737864, - "grad_norm": 11.012279510498047, + "grad_norm": 7.950681686401367, "learning_rate": 2.1286407766990295e-05, - "loss": 1.4434, + "loss": 0.284, "step": 3550 }, { "epoch": 17.475728155339805, - "grad_norm": 11.989383697509766, + "grad_norm": 8.526535034179688, "learning_rate": 2.0881877022653725e-05, - "loss": 1.4708, + "loss": 0.2934, "step": 3600 }, { "epoch": 17.718446601941746, - "grad_norm": 12.47008991241455, + "grad_norm": 8.999290466308594, "learning_rate": 2.047734627831715e-05, - "loss": 1.4783, + "loss": 0.2964, "step": 3650 }, { "epoch": 17.96116504854369, - "grad_norm": 10.983756065368652, + "grad_norm": 9.228137969970703, "learning_rate": 2.007281553398058e-05, - "loss": 1.4839, + "loss": 0.311, "step": 3700 }, { "epoch": 18.203883495145632, - "grad_norm": 9.582510948181152, + "grad_norm": 6.715802192687988, "learning_rate": 1.9668284789644014e-05, - "loss": 1.3226, + "loss": 0.2263, "step": 3750 }, { "epoch": 18.446601941747574, - "grad_norm": 11.663228034973145, + "grad_norm": 7.834264278411865, "learning_rate": 1.9263754045307444e-05, - "loss": 1.3522, + "loss": 0.2293, "step": 3800 }, { "epoch": 18.689320388349515, - "grad_norm": 11.278139114379883, + "grad_norm": 6.83228874206543, "learning_rate": 1.8859223300970874e-05, - "loss": 1.3753, + "loss": 0.2392, "step": 3850 }, { "epoch": 18.932038834951456, - "grad_norm": 10.586488723754883, + "grad_norm": 8.507209777832031, "learning_rate": 1.8454692556634304e-05, - "loss": 1.3774, + "loss": 0.2377, "step": 3900 }, { "epoch": 19.174757281553397, - "grad_norm": 11.243274688720703, + "grad_norm": 6.565855026245117, "learning_rate": 1.8050161812297738e-05, - "loss": 1.2506, + "loss": 0.1905, "step": 3950 }, { "epoch": 19.41747572815534, - "grad_norm": 10.601967811584473, + "grad_norm": 7.470388889312744, "learning_rate": 1.7645631067961167e-05, - "loss": 1.2276, + "loss": 0.1846, "step": 4000 }, { "epoch": 19.660194174757283, - "grad_norm": 10.919422149658203, + "grad_norm": 7.033578395843506, "learning_rate": 1.7241100323624594e-05, - "loss": 1.2518, + "loss": 0.1911, "step": 4050 }, { "epoch": 19.902912621359224, - "grad_norm": 11.303253173828125, + "grad_norm": 8.110376358032227, "learning_rate": 1.6836569579288027e-05, - "loss": 1.279, + "loss": 0.1954, "step": 4100 }, { "epoch": 20.145631067961165, - "grad_norm": 10.864278793334961, + "grad_norm": 7.335451602935791, "learning_rate": 1.6432038834951457e-05, - "loss": 1.1686, + "loss": 0.1619, "step": 4150 }, { "epoch": 20.388349514563107, - "grad_norm": 11.179913520812988, + "grad_norm": 7.478648662567139, "learning_rate": 1.6027508090614887e-05, - "loss": 1.1252, + "loss": 0.1497, "step": 4200 }, { "epoch": 20.631067961165048, - "grad_norm": 10.640514373779297, + "grad_norm": 7.764925479888916, "learning_rate": 1.5622977346278317e-05, - "loss": 1.1581, + "loss": 0.1559, "step": 4250 }, { "epoch": 20.87378640776699, - "grad_norm": 11.186044692993164, + "grad_norm": 6.432894706726074, "learning_rate": 1.5218446601941749e-05, - "loss": 1.1782, + "loss": 0.1601, "step": 4300 }, { "epoch": 21.116504854368934, - "grad_norm": 11.105583190917969, + "grad_norm": 7.728542327880859, "learning_rate": 1.4813915857605179e-05, - "loss": 1.0972, + "loss": 0.1416, "step": 4350 }, { "epoch": 21.359223300970875, - "grad_norm": 10.809669494628906, + "grad_norm": 6.370733737945557, "learning_rate": 1.4409385113268609e-05, - "loss": 1.0487, + "loss": 0.1211, "step": 4400 }, { "epoch": 21.601941747572816, - "grad_norm": 11.11020278930664, + "grad_norm": 5.939419746398926, "learning_rate": 1.4004854368932039e-05, - "loss": 1.0566, + "loss": 0.1278, "step": 4450 }, { "epoch": 21.844660194174757, - "grad_norm": 11.062104225158691, + "grad_norm": 6.0013957023620605, "learning_rate": 1.360032362459547e-05, - "loss": 1.078, + "loss": 0.1274, "step": 4500 }, { "epoch": 22.0873786407767, - "grad_norm": 10.393362998962402, + "grad_norm": 6.1365966796875, "learning_rate": 1.31957928802589e-05, - "loss": 1.0351, + "loss": 0.1205, "step": 4550 }, { "epoch": 22.33009708737864, - "grad_norm": 9.539032936096191, + "grad_norm": 5.679115295410156, "learning_rate": 1.279126213592233e-05, - "loss": 0.9655, + "loss": 0.106, "step": 4600 }, { "epoch": 22.57281553398058, - "grad_norm": 10.375469207763672, + "grad_norm": 4.750304698944092, "learning_rate": 1.2386731391585762e-05, - "loss": 0.9857, + "loss": 0.108, "step": 4650 }, { "epoch": 22.815533980582526, - "grad_norm": 11.338729858398438, + "grad_norm": 6.115650653839111, "learning_rate": 1.1982200647249192e-05, - "loss": 0.9975, + "loss": 0.1086, "step": 4700 }, { "epoch": 23.058252427184467, - "grad_norm": 10.976126670837402, + "grad_norm": 5.252938270568848, "learning_rate": 1.1577669902912622e-05, - "loss": 0.9785, + "loss": 0.1041, "step": 4750 }, { "epoch": 23.300970873786408, - "grad_norm": 11.16224193572998, + "grad_norm": 5.175116062164307, "learning_rate": 1.1173139158576053e-05, - "loss": 0.9011, + "loss": 0.0872, "step": 4800 }, { "epoch": 23.54368932038835, - "grad_norm": 11.640942573547363, + "grad_norm": 5.768143653869629, "learning_rate": 1.0768608414239483e-05, - "loss": 0.9056, + "loss": 0.0945, "step": 4850 }, { "epoch": 23.78640776699029, - "grad_norm": 10.686304092407227, + "grad_norm": 5.235666275024414, "learning_rate": 1.0364077669902913e-05, - "loss": 0.9267, + "loss": 0.0969, "step": 4900 }, { "epoch": 24.02912621359223, - "grad_norm": 9.798011779785156, + "grad_norm": 4.407253265380859, "learning_rate": 9.959546925566343e-06, - "loss": 0.9134, + "loss": 0.0933, "step": 4950 }, { "epoch": 24.271844660194176, - "grad_norm": 10.638201713562012, + "grad_norm": 5.28978157043457, "learning_rate": 9.555016181229775e-06, - "loss": 0.8257, + "loss": 0.0766, "step": 5000 } ], @@ -727,7 +727,7 @@ "attributes": {} } }, - "total_flos": 653241876480000.0, + "total_flos": 2321767464960000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoints/checkpoint-5000/training_args.bin b/checkpoints/checkpoint-5000/training_args.bin index 0403c28e41ead56dc065860ac748f500b2df575a..6dcc5a1a1def25e4256c25ec89c109edb25731fb 100644 --- a/checkpoints/checkpoint-5000/training_args.bin +++ b/checkpoints/checkpoint-5000/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7935e3b7629485cf26f624afe618b58a390d4c3c9c6dcf878b3ba9099ef1027b +oid sha256:d5c04ce53d557cd762ff77607aaadbd1248aaa177697662d9a21b90d444993b7 size 5304 diff --git a/checkpoints/checkpoint-5500/config.json b/checkpoints/checkpoint-5500/config.json index fdd810faba252b1c3f085663999fe9c24e8e44d6..990b1515737ce9bfc8638b4941fcfacdce1334d3 100644 --- a/checkpoints/checkpoint-5500/config.json +++ b/checkpoints/checkpoint-5500/config.json @@ -10,12 +10,12 @@ "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", - "n_ctx": 1024, - "n_embd": 768, + "n_ctx": 2048, + "n_embd": 1024, "n_head": 2, "n_inner": null, - "n_layer": 6, - "n_positions": 1024, + "n_layer": 12, + "n_positions": 2048, "pad_token_id": 1, "reorder_and_upcast_attn": false, "resid_pdrop": 0.1, diff --git a/checkpoints/checkpoint-5500/model.safetensors b/checkpoints/checkpoint-5500/model.safetensors index 738b877d9897a0c05724da0ae5acccfc58961f53..d1f7b10f382e7eda7373000faf2ac2cdc532ca9c 100644 --- a/checkpoints/checkpoint-5500/model.safetensors +++ b/checkpoints/checkpoint-5500/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27205eee4af3eae6a45d2039bd34f1f181ca0c93cd18e71f2955c54ea4292fb1 -size 176343496 +oid sha256:174a765237472aaa691df21b25722e19fa5ed756d63c4e25493870b81d636beb +size 617130824 diff --git a/checkpoints/checkpoint-5500/optimizer.pt b/checkpoints/checkpoint-5500/optimizer.pt index c198307eee689d6eaac54fe156359b82de1a5987..b5afd3d10bc576ea0cbcbc630aff5c25ac353283 100644 --- a/checkpoints/checkpoint-5500/optimizer.pt +++ b/checkpoints/checkpoint-5500/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4b6f591963c2ef749fb8b9188470253909aaec8a2e4dd7fcc84af2d9ac5ce94 -size 352735610 +oid sha256:b4aea01fc5ce71560b17b6b806ea6049d611f7ca1c34a53beb6d62ecda694a1f +size 1234355130 diff --git a/checkpoints/checkpoint-5500/rng_state.pth b/checkpoints/checkpoint-5500/rng_state.pth index 76d1ab002c8b8bf0afbfbab0c654ca9719bafb4b..a16bc0187c6c425784fae5e6e570cf0376242199 100644 --- a/checkpoints/checkpoint-5500/rng_state.pth +++ b/checkpoints/checkpoint-5500/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac024d41359d5069c6b5deaf750dedad487ffc03a6c12bbcb8eb368c22272ff0 +oid sha256:a5690a7b49339c64ee916755c1f9baac58f9b80ceec80935020a42af7777ee1b size 14244 diff --git a/checkpoints/checkpoint-5500/trainer_state.json b/checkpoints/checkpoint-5500/trainer_state.json index 3fe34e6e0cca9453a604563d75a00d5075ed4d89..a9fac5617f4edf1ef68832857a58c18e91b8d6d7 100644 --- a/checkpoints/checkpoint-5500/trainer_state.json +++ b/checkpoints/checkpoint-5500/trainer_state.json @@ -11,772 +11,772 @@ "log_history": [ { "epoch": 0.24271844660194175, - "grad_norm": 2.239194869995117, + "grad_norm": 2.8362834453582764, "learning_rate": 4.960355987055016e-05, - "loss": 6.099, + "loss": 6.0742, "step": 50 }, { "epoch": 0.4854368932038835, - "grad_norm": 3.8653194904327393, + "grad_norm": 2.9484496116638184, "learning_rate": 4.9199029126213595e-05, - "loss": 5.6954, + "loss": 5.7218, "step": 100 }, { "epoch": 0.7281553398058253, - "grad_norm": 4.652886867523193, + "grad_norm": 3.542572259902954, "learning_rate": 4.879449838187702e-05, - "loss": 5.3598, + "loss": 5.4227, "step": 150 }, { "epoch": 0.970873786407767, - "grad_norm": 5.048999309539795, + "grad_norm": 4.634098052978516, "learning_rate": 4.8389967637540455e-05, - "loss": 5.0982, + "loss": 5.1655, "step": 200 }, { "epoch": 1.2135922330097086, - "grad_norm": 5.21944522857666, + "grad_norm": 5.699330806732178, "learning_rate": 4.798543689320388e-05, - "loss": 4.7456, + "loss": 4.8009, "step": 250 }, { "epoch": 1.4563106796116505, - "grad_norm": 5.471007347106934, + "grad_norm": 5.711933135986328, "learning_rate": 4.7580906148867315e-05, - "loss": 4.6254, + "loss": 4.607, "step": 300 }, { "epoch": 1.6990291262135924, - "grad_norm": 5.345245838165283, + "grad_norm": 5.113691806793213, "learning_rate": 4.717637540453075e-05, - "loss": 4.5216, + "loss": 4.462, "step": 350 }, { "epoch": 1.941747572815534, - "grad_norm": 5.54531192779541, + "grad_norm": 5.632521152496338, "learning_rate": 4.6771844660194174e-05, - "loss": 4.4353, + "loss": 4.3695, "step": 400 }, { "epoch": 2.1844660194174756, - "grad_norm": 5.244582176208496, + "grad_norm": 5.428906440734863, "learning_rate": 4.636731391585761e-05, - "loss": 4.2444, + "loss": 4.1549, "step": 450 }, { "epoch": 2.4271844660194173, - "grad_norm": 5.338261127471924, + "grad_norm": 5.037013530731201, "learning_rate": 4.596278317152104e-05, - "loss": 4.1915, + "loss": 4.0921, "step": 500 }, { "epoch": 2.6699029126213594, - "grad_norm": 5.35228157043457, + "grad_norm": 5.231846809387207, "learning_rate": 4.555825242718447e-05, - "loss": 4.1612, + "loss": 4.0662, "step": 550 }, { "epoch": 2.912621359223301, - "grad_norm": 5.287161827087402, + "grad_norm": 5.506278038024902, "learning_rate": 4.51537216828479e-05, - "loss": 4.1322, + "loss": 4.0403, "step": 600 }, { "epoch": 3.1553398058252426, - "grad_norm": 5.2932209968566895, + "grad_norm": 5.410216331481934, "learning_rate": 4.4749190938511334e-05, - "loss": 3.9684, + "loss": 3.8375, "step": 650 }, { "epoch": 3.3980582524271843, - "grad_norm": 5.4360833168029785, + "grad_norm": 5.637699127197266, "learning_rate": 4.434466019417476e-05, - "loss": 3.8933, + "loss": 3.7473, "step": 700 }, { "epoch": 3.6407766990291264, - "grad_norm": 5.5595221519470215, + "grad_norm": 5.575273513793945, "learning_rate": 4.3940129449838194e-05, - "loss": 3.9306, + "loss": 3.7835, "step": 750 }, { "epoch": 3.883495145631068, - "grad_norm": 5.540086269378662, + "grad_norm": 5.737198352813721, "learning_rate": 4.353559870550162e-05, - "loss": 3.9317, + "loss": 3.7919, "step": 800 }, { "epoch": 4.12621359223301, - "grad_norm": 5.567137718200684, + "grad_norm": 5.645532608032227, "learning_rate": 4.313106796116505e-05, - "loss": 3.8135, + "loss": 3.6425, "step": 850 }, { "epoch": 4.368932038834951, - "grad_norm": 6.00346040725708, + "grad_norm": 6.308927059173584, "learning_rate": 4.272653721682848e-05, - "loss": 3.6924, + "loss": 3.4785, "step": 900 }, { "epoch": 4.611650485436893, - "grad_norm": 5.635782718658447, + "grad_norm": 5.863094329833984, "learning_rate": 4.232200647249191e-05, - "loss": 3.7304, + "loss": 3.5268, "step": 950 }, { "epoch": 4.854368932038835, - "grad_norm": 5.9545440673828125, + "grad_norm": 6.120258331298828, "learning_rate": 4.191747572815534e-05, - "loss": 3.7471, + "loss": 3.5296, "step": 1000 }, { "epoch": 5.097087378640777, - "grad_norm": 5.696166038513184, + "grad_norm": 6.231515884399414, "learning_rate": 4.1512944983818774e-05, - "loss": 3.6287, + "loss": 3.3802, "step": 1050 }, { "epoch": 5.339805825242719, - "grad_norm": 5.844620704650879, + "grad_norm": 6.211188316345215, "learning_rate": 4.11084142394822e-05, - "loss": 3.5299, + "loss": 3.2148, "step": 1100 }, { "epoch": 5.58252427184466, - "grad_norm": 6.373577117919922, + "grad_norm": 6.8676934242248535, "learning_rate": 4.0703883495145634e-05, - "loss": 3.5285, + "loss": 3.2111, "step": 1150 }, { "epoch": 5.825242718446602, - "grad_norm": 6.488651275634766, + "grad_norm": 6.851133346557617, "learning_rate": 4.029935275080906e-05, - "loss": 3.5789, + "loss": 3.2771, "step": 1200 }, { "epoch": 6.067961165048544, - "grad_norm": 6.170980930328369, + "grad_norm": 6.708765506744385, "learning_rate": 3.9894822006472494e-05, - "loss": 3.494, + "loss": 3.1587, "step": 1250 }, { "epoch": 6.310679611650485, - "grad_norm": 6.387782573699951, + "grad_norm": 7.136553764343262, "learning_rate": 3.949029126213593e-05, - "loss": 3.353, + "loss": 2.9192, "step": 1300 }, { "epoch": 6.553398058252427, - "grad_norm": 6.718175888061523, + "grad_norm": 7.370823383331299, "learning_rate": 3.9085760517799354e-05, - "loss": 3.3521, + "loss": 2.922, "step": 1350 }, { "epoch": 6.796116504854369, - "grad_norm": 6.7870025634765625, + "grad_norm": 7.630361557006836, "learning_rate": 3.868122977346279e-05, - "loss": 3.3658, + "loss": 2.9417, "step": 1400 }, { "epoch": 7.038834951456311, - "grad_norm": 6.513721466064453, + "grad_norm": 7.765511989593506, "learning_rate": 3.827669902912622e-05, - "loss": 3.3301, + "loss": 2.8669, "step": 1450 }, { "epoch": 7.281553398058253, - "grad_norm": 7.0680036544799805, + "grad_norm": 7.85439920425415, "learning_rate": 3.787216828478965e-05, - "loss": 3.1289, + "loss": 2.5412, "step": 1500 }, { "epoch": 7.524271844660194, - "grad_norm": 7.019896507263184, + "grad_norm": 8.064071655273438, "learning_rate": 3.746763754045307e-05, - "loss": 3.1878, + "loss": 2.6131, "step": 1550 }, { "epoch": 7.766990291262136, - "grad_norm": 7.3232197761535645, + "grad_norm": 8.692522048950195, "learning_rate": 3.7063106796116507e-05, - "loss": 3.2006, + "loss": 2.6356, "step": 1600 }, { "epoch": 8.009708737864077, - "grad_norm": 7.003912925720215, + "grad_norm": 8.012410163879395, "learning_rate": 3.665857605177993e-05, - "loss": 3.1737, + "loss": 2.599, "step": 1650 }, { "epoch": 8.25242718446602, - "grad_norm": 7.592433929443359, + "grad_norm": 8.6799898147583, "learning_rate": 3.6254045307443366e-05, - "loss": 2.9258, + "loss": 2.1992, "step": 1700 }, { "epoch": 8.495145631067961, - "grad_norm": 7.629278182983398, + "grad_norm": 8.963507652282715, "learning_rate": 3.584951456310679e-05, - "loss": 2.9999, + "loss": 2.2701, "step": 1750 }, { "epoch": 8.737864077669903, - "grad_norm": 8.137636184692383, + "grad_norm": 9.833822250366211, "learning_rate": 3.5444983818770226e-05, - "loss": 3.0155, + "loss": 2.2837, "step": 1800 }, { "epoch": 8.980582524271846, - "grad_norm": 7.967227935791016, + "grad_norm": 9.47242259979248, "learning_rate": 3.504045307443366e-05, - "loss": 2.9945, + "loss": 2.2854, "step": 1850 }, { "epoch": 9.223300970873787, - "grad_norm": 7.9910078048706055, + "grad_norm": 8.856402397155762, "learning_rate": 3.4635922330097086e-05, - "loss": 2.7456, + "loss": 1.8862, "step": 1900 }, { "epoch": 9.466019417475728, - "grad_norm": 8.552720069885254, + "grad_norm": 9.948341369628906, "learning_rate": 3.423139158576052e-05, - "loss": 2.8071, + "loss": 1.922, "step": 1950 }, { "epoch": 9.70873786407767, - "grad_norm": 8.336930274963379, + "grad_norm": 10.28939151763916, "learning_rate": 3.382686084142395e-05, - "loss": 2.8117, + "loss": 1.9622, "step": 2000 }, { "epoch": 9.951456310679612, - "grad_norm": 8.636338233947754, + "grad_norm": 10.114766120910645, "learning_rate": 3.342233009708738e-05, - "loss": 2.8197, + "loss": 1.9741, "step": 2050 }, { "epoch": 10.194174757281553, - "grad_norm": 7.955165863037109, + "grad_norm": 9.489270210266113, "learning_rate": 3.301779935275081e-05, - "loss": 2.6245, + "loss": 1.6509, "step": 2100 }, { "epoch": 10.436893203883495, - "grad_norm": 8.679149627685547, + "grad_norm": 10.16482162475586, "learning_rate": 3.2613268608414246e-05, - "loss": 2.565, + "loss": 1.5883, "step": 2150 }, { "epoch": 10.679611650485437, - "grad_norm": 9.150839805603027, + "grad_norm": 11.778059005737305, "learning_rate": 3.220873786407767e-05, - "loss": 2.6258, + "loss": 1.6234, "step": 2200 }, { "epoch": 10.922330097087379, - "grad_norm": 9.2735013961792, + "grad_norm": 10.798454284667969, "learning_rate": 3.1804207119741106e-05, - "loss": 2.6439, + "loss": 1.667, "step": 2250 }, { "epoch": 11.16504854368932, - "grad_norm": 8.620939254760742, + "grad_norm": 9.751585006713867, "learning_rate": 3.139967637540453e-05, - "loss": 2.4371, + "loss": 1.3785, "step": 2300 }, { "epoch": 11.407766990291263, - "grad_norm": 9.722040176391602, + "grad_norm": 10.266364097595215, "learning_rate": 3.099514563106796e-05, - "loss": 2.4074, + "loss": 1.2956, "step": 2350 }, { "epoch": 11.650485436893204, - "grad_norm": 9.200730323791504, + "grad_norm": 10.660856246948242, "learning_rate": 3.059061488673139e-05, - "loss": 2.4307, + "loss": 1.3321, "step": 2400 }, { "epoch": 11.893203883495145, - "grad_norm": 9.692273139953613, + "grad_norm": 10.896187782287598, "learning_rate": 3.0186084142394822e-05, - "loss": 2.424, + "loss": 1.3533, "step": 2450 }, { "epoch": 12.135922330097088, - "grad_norm": 9.822249412536621, + "grad_norm": 10.952502250671387, "learning_rate": 2.9781553398058252e-05, - "loss": 2.2862, + "loss": 1.1486, "step": 2500 }, { "epoch": 12.37864077669903, - "grad_norm": 9.919060707092285, + "grad_norm": 10.766463279724121, "learning_rate": 2.9377022653721686e-05, - "loss": 2.1913, + "loss": 1.0227, "step": 2550 }, { "epoch": 12.62135922330097, - "grad_norm": 9.181950569152832, + "grad_norm": 9.385764122009277, "learning_rate": 2.8972491909385112e-05, - "loss": 2.241, + "loss": 1.0651, "step": 2600 }, { "epoch": 12.864077669902912, - "grad_norm": 10.82077407836914, + "grad_norm": 11.573925018310547, "learning_rate": 2.8567961165048546e-05, - "loss": 2.2653, + "loss": 1.1013, "step": 2650 }, { "epoch": 13.106796116504855, - "grad_norm": 10.365830421447754, + "grad_norm": 10.138091087341309, "learning_rate": 2.816343042071198e-05, - "loss": 2.1352, + "loss": 0.9545, "step": 2700 }, { "epoch": 13.349514563106796, - "grad_norm": 10.073640823364258, + "grad_norm": 9.867119789123535, "learning_rate": 2.7758899676375405e-05, - "loss": 2.0227, + "loss": 0.7892, "step": 2750 }, { "epoch": 13.592233009708737, - "grad_norm": 10.520242691040039, + "grad_norm": 10.443971633911133, "learning_rate": 2.735436893203884e-05, - "loss": 2.0579, + "loss": 0.8432, "step": 2800 }, { "epoch": 13.83495145631068, - "grad_norm": 11.176478385925293, + "grad_norm": 10.633298873901367, "learning_rate": 2.6949838187702265e-05, - "loss": 2.0934, + "loss": 0.8543, "step": 2850 }, { "epoch": 14.077669902912621, - "grad_norm": 9.541868209838867, + "grad_norm": 8.640830039978027, "learning_rate": 2.6545307443365695e-05, - "loss": 2.0118, + "loss": 0.7751, "step": 2900 }, { "epoch": 14.320388349514563, - "grad_norm": 10.162981986999512, + "grad_norm": 10.461247444152832, "learning_rate": 2.614077669902913e-05, - "loss": 1.8624, + "loss": 0.6074, "step": 2950 }, { "epoch": 14.563106796116505, - "grad_norm": 10.51291275024414, + "grad_norm": 10.757479667663574, "learning_rate": 2.5736245954692555e-05, - "loss": 1.8979, + "loss": 0.6354, "step": 3000 }, { "epoch": 14.805825242718447, - "grad_norm": 11.316717147827148, + "grad_norm": 11.599132537841797, "learning_rate": 2.533171521035599e-05, - "loss": 1.915, + "loss": 0.6694, "step": 3050 }, { "epoch": 15.048543689320388, - "grad_norm": 9.184353828430176, + "grad_norm": 8.085358619689941, "learning_rate": 2.492718446601942e-05, - "loss": 1.885, + "loss": 0.6306, "step": 3100 }, { "epoch": 15.29126213592233, - "grad_norm": 10.70876693725586, + "grad_norm": 8.327988624572754, "learning_rate": 2.452265372168285e-05, - "loss": 1.7024, + "loss": 0.4628, "step": 3150 }, { "epoch": 15.533980582524272, - "grad_norm": 10.767953872680664, + "grad_norm": 8.545391082763672, "learning_rate": 2.411812297734628e-05, - "loss": 1.729, + "loss": 0.4915, "step": 3200 }, { "epoch": 15.776699029126213, - "grad_norm": 11.83687973022461, + "grad_norm": 10.176375389099121, "learning_rate": 2.3713592233009708e-05, - "loss": 1.7697, + "loss": 0.5105, "step": 3250 }, { "epoch": 16.019417475728154, - "grad_norm": 9.888689994812012, + "grad_norm": 7.821159362792969, "learning_rate": 2.3309061488673138e-05, - "loss": 1.7605, + "loss": 0.5055, "step": 3300 }, { "epoch": 16.262135922330096, - "grad_norm": 9.394012451171875, + "grad_norm": 7.797800064086914, "learning_rate": 2.290453074433657e-05, - "loss": 1.5389, + "loss": 0.3486, "step": 3350 }, { "epoch": 16.50485436893204, - "grad_norm": 11.3172025680542, + "grad_norm": 8.497370719909668, "learning_rate": 2.25e-05, - "loss": 1.5836, + "loss": 0.3627, "step": 3400 }, { "epoch": 16.74757281553398, - "grad_norm": 11.000115394592285, + "grad_norm": 9.842249870300293, "learning_rate": 2.209546925566343e-05, - "loss": 1.6398, + "loss": 0.3969, "step": 3450 }, { "epoch": 16.990291262135923, - "grad_norm": 11.453404426574707, + "grad_norm": 10.705739974975586, "learning_rate": 2.169093851132686e-05, - "loss": 1.6466, + "loss": 0.4028, "step": 3500 }, { "epoch": 17.233009708737864, - "grad_norm": 11.012279510498047, + "grad_norm": 7.950681686401367, "learning_rate": 2.1286407766990295e-05, - "loss": 1.4434, + "loss": 0.284, "step": 3550 }, { "epoch": 17.475728155339805, - "grad_norm": 11.989383697509766, + "grad_norm": 8.526535034179688, "learning_rate": 2.0881877022653725e-05, - "loss": 1.4708, + "loss": 0.2934, "step": 3600 }, { "epoch": 17.718446601941746, - "grad_norm": 12.47008991241455, + "grad_norm": 8.999290466308594, "learning_rate": 2.047734627831715e-05, - "loss": 1.4783, + "loss": 0.2964, "step": 3650 }, { "epoch": 17.96116504854369, - "grad_norm": 10.983756065368652, + "grad_norm": 9.228137969970703, "learning_rate": 2.007281553398058e-05, - "loss": 1.4839, + "loss": 0.311, "step": 3700 }, { "epoch": 18.203883495145632, - "grad_norm": 9.582510948181152, + "grad_norm": 6.715802192687988, "learning_rate": 1.9668284789644014e-05, - "loss": 1.3226, + "loss": 0.2263, "step": 3750 }, { "epoch": 18.446601941747574, - "grad_norm": 11.663228034973145, + "grad_norm": 7.834264278411865, "learning_rate": 1.9263754045307444e-05, - "loss": 1.3522, + "loss": 0.2293, "step": 3800 }, { "epoch": 18.689320388349515, - "grad_norm": 11.278139114379883, + "grad_norm": 6.83228874206543, "learning_rate": 1.8859223300970874e-05, - "loss": 1.3753, + "loss": 0.2392, "step": 3850 }, { "epoch": 18.932038834951456, - "grad_norm": 10.586488723754883, + "grad_norm": 8.507209777832031, "learning_rate": 1.8454692556634304e-05, - "loss": 1.3774, + "loss": 0.2377, "step": 3900 }, { "epoch": 19.174757281553397, - "grad_norm": 11.243274688720703, + "grad_norm": 6.565855026245117, "learning_rate": 1.8050161812297738e-05, - "loss": 1.2506, + "loss": 0.1905, "step": 3950 }, { "epoch": 19.41747572815534, - "grad_norm": 10.601967811584473, + "grad_norm": 7.470388889312744, "learning_rate": 1.7645631067961167e-05, - "loss": 1.2276, + "loss": 0.1846, "step": 4000 }, { "epoch": 19.660194174757283, - "grad_norm": 10.919422149658203, + "grad_norm": 7.033578395843506, "learning_rate": 1.7241100323624594e-05, - "loss": 1.2518, + "loss": 0.1911, "step": 4050 }, { "epoch": 19.902912621359224, - "grad_norm": 11.303253173828125, + "grad_norm": 8.110376358032227, "learning_rate": 1.6836569579288027e-05, - "loss": 1.279, + "loss": 0.1954, "step": 4100 }, { "epoch": 20.145631067961165, - "grad_norm": 10.864278793334961, + "grad_norm": 7.335451602935791, "learning_rate": 1.6432038834951457e-05, - "loss": 1.1686, + "loss": 0.1619, "step": 4150 }, { "epoch": 20.388349514563107, - "grad_norm": 11.179913520812988, + "grad_norm": 7.478648662567139, "learning_rate": 1.6027508090614887e-05, - "loss": 1.1252, + "loss": 0.1497, "step": 4200 }, { "epoch": 20.631067961165048, - "grad_norm": 10.640514373779297, + "grad_norm": 7.764925479888916, "learning_rate": 1.5622977346278317e-05, - "loss": 1.1581, + "loss": 0.1559, "step": 4250 }, { "epoch": 20.87378640776699, - "grad_norm": 11.186044692993164, + "grad_norm": 6.432894706726074, "learning_rate": 1.5218446601941749e-05, - "loss": 1.1782, + "loss": 0.1601, "step": 4300 }, { "epoch": 21.116504854368934, - "grad_norm": 11.105583190917969, + "grad_norm": 7.728542327880859, "learning_rate": 1.4813915857605179e-05, - "loss": 1.0972, + "loss": 0.1416, "step": 4350 }, { "epoch": 21.359223300970875, - "grad_norm": 10.809669494628906, + "grad_norm": 6.370733737945557, "learning_rate": 1.4409385113268609e-05, - "loss": 1.0487, + "loss": 0.1211, "step": 4400 }, { "epoch": 21.601941747572816, - "grad_norm": 11.11020278930664, + "grad_norm": 5.939419746398926, "learning_rate": 1.4004854368932039e-05, - "loss": 1.0566, + "loss": 0.1278, "step": 4450 }, { "epoch": 21.844660194174757, - "grad_norm": 11.062104225158691, + "grad_norm": 6.0013957023620605, "learning_rate": 1.360032362459547e-05, - "loss": 1.078, + "loss": 0.1274, "step": 4500 }, { "epoch": 22.0873786407767, - "grad_norm": 10.393362998962402, + "grad_norm": 6.1365966796875, "learning_rate": 1.31957928802589e-05, - "loss": 1.0351, + "loss": 0.1205, "step": 4550 }, { "epoch": 22.33009708737864, - "grad_norm": 9.539032936096191, + "grad_norm": 5.679115295410156, "learning_rate": 1.279126213592233e-05, - "loss": 0.9655, + "loss": 0.106, "step": 4600 }, { "epoch": 22.57281553398058, - "grad_norm": 10.375469207763672, + "grad_norm": 4.750304698944092, "learning_rate": 1.2386731391585762e-05, - "loss": 0.9857, + "loss": 0.108, "step": 4650 }, { "epoch": 22.815533980582526, - "grad_norm": 11.338729858398438, + "grad_norm": 6.115650653839111, "learning_rate": 1.1982200647249192e-05, - "loss": 0.9975, + "loss": 0.1086, "step": 4700 }, { "epoch": 23.058252427184467, - "grad_norm": 10.976126670837402, + "grad_norm": 5.252938270568848, "learning_rate": 1.1577669902912622e-05, - "loss": 0.9785, + "loss": 0.1041, "step": 4750 }, { "epoch": 23.300970873786408, - "grad_norm": 11.16224193572998, + "grad_norm": 5.175116062164307, "learning_rate": 1.1173139158576053e-05, - "loss": 0.9011, + "loss": 0.0872, "step": 4800 }, { "epoch": 23.54368932038835, - "grad_norm": 11.640942573547363, + "grad_norm": 5.768143653869629, "learning_rate": 1.0768608414239483e-05, - "loss": 0.9056, + "loss": 0.0945, "step": 4850 }, { "epoch": 23.78640776699029, - "grad_norm": 10.686304092407227, + "grad_norm": 5.235666275024414, "learning_rate": 1.0364077669902913e-05, - "loss": 0.9267, + "loss": 0.0969, "step": 4900 }, { "epoch": 24.02912621359223, - "grad_norm": 9.798011779785156, + "grad_norm": 4.407253265380859, "learning_rate": 9.959546925566343e-06, - "loss": 0.9134, + "loss": 0.0933, "step": 4950 }, { "epoch": 24.271844660194176, - "grad_norm": 10.638201713562012, + "grad_norm": 5.28978157043457, "learning_rate": 9.555016181229775e-06, - "loss": 0.8257, + "loss": 0.0766, "step": 5000 }, { "epoch": 24.514563106796118, - "grad_norm": 10.393739700317383, + "grad_norm": 6.059443950653076, "learning_rate": 9.150485436893205e-06, - "loss": 0.8574, + "loss": 0.082, "step": 5050 }, { "epoch": 24.75728155339806, - "grad_norm": 11.262478828430176, + "grad_norm": 4.627708435058594, "learning_rate": 8.745954692556635e-06, - "loss": 0.8645, + "loss": 0.0785, "step": 5100 }, { "epoch": 25.0, - "grad_norm": 12.033185005187988, + "grad_norm": 5.263092041015625, "learning_rate": 8.341423948220065e-06, - "loss": 0.8719, + "loss": 0.0816, "step": 5150 }, { "epoch": 25.24271844660194, - "grad_norm": 10.399738311767578, + "grad_norm": 6.337419509887695, "learning_rate": 7.936893203883496e-06, - "loss": 0.7727, + "loss": 0.0676, "step": 5200 }, { "epoch": 25.485436893203882, - "grad_norm": 9.93704605102539, + "grad_norm": 3.846252202987671, "learning_rate": 7.532362459546925e-06, - "loss": 0.799, + "loss": 0.0699, "step": 5250 }, { "epoch": 25.728155339805824, - "grad_norm": 9.828903198242188, + "grad_norm": 5.770230293273926, "learning_rate": 7.127831715210356e-06, - "loss": 0.8073, + "loss": 0.0673, "step": 5300 }, { "epoch": 25.97087378640777, - "grad_norm": 11.109928131103516, + "grad_norm": 4.10913610458374, "learning_rate": 6.723300970873788e-06, - "loss": 0.8155, + "loss": 0.0706, "step": 5350 }, { "epoch": 26.21359223300971, - "grad_norm": 10.467114448547363, + "grad_norm": 4.198057174682617, "learning_rate": 6.318770226537217e-06, - "loss": 0.7476, + "loss": 0.0615, "step": 5400 }, { "epoch": 26.45631067961165, - "grad_norm": 10.594189643859863, + "grad_norm": 5.369973182678223, "learning_rate": 5.914239482200648e-06, - "loss": 0.7518, + "loss": 0.0585, "step": 5450 }, { "epoch": 26.699029126213592, - "grad_norm": 9.673776626586914, + "grad_norm": 5.3213324546813965, "learning_rate": 5.5097087378640776e-06, - "loss": 0.7584, + "loss": 0.0594, "step": 5500 } ], @@ -797,7 +797,7 @@ "attributes": {} } }, - "total_flos": 718566064128000.0, + "total_flos": 2553944211456000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoints/checkpoint-5500/training_args.bin b/checkpoints/checkpoint-5500/training_args.bin index 0403c28e41ead56dc065860ac748f500b2df575a..6dcc5a1a1def25e4256c25ec89c109edb25731fb 100644 --- a/checkpoints/checkpoint-5500/training_args.bin +++ b/checkpoints/checkpoint-5500/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7935e3b7629485cf26f624afe618b58a390d4c3c9c6dcf878b3ba9099ef1027b +oid sha256:d5c04ce53d557cd762ff77607aaadbd1248aaa177697662d9a21b90d444993b7 size 5304 diff --git a/checkpoints/checkpoint-6000/config.json b/checkpoints/checkpoint-6000/config.json index fdd810faba252b1c3f085663999fe9c24e8e44d6..990b1515737ce9bfc8638b4941fcfacdce1334d3 100644 --- a/checkpoints/checkpoint-6000/config.json +++ b/checkpoints/checkpoint-6000/config.json @@ -10,12 +10,12 @@ "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", - "n_ctx": 1024, - "n_embd": 768, + "n_ctx": 2048, + "n_embd": 1024, "n_head": 2, "n_inner": null, - "n_layer": 6, - "n_positions": 1024, + "n_layer": 12, + "n_positions": 2048, "pad_token_id": 1, "reorder_and_upcast_attn": false, "resid_pdrop": 0.1, diff --git a/checkpoints/checkpoint-6000/model.safetensors b/checkpoints/checkpoint-6000/model.safetensors index 5da1bd3dcf76a1fa7b3c113ade41c491b8f994fa..d60265c480bbb6228e3ee56788485f24f7d5a36a 100644 --- a/checkpoints/checkpoint-6000/model.safetensors +++ b/checkpoints/checkpoint-6000/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d61b1d8ecb28eb7a68f3d74381094be2b23ae7fae1ddbb447d605e957174342 -size 176343496 +oid sha256:f52f2066dac40e1280cb202f68d8183beeeb55d6109623a81b9a11e71d1a78c7 +size 617130824 diff --git a/checkpoints/checkpoint-6000/optimizer.pt b/checkpoints/checkpoint-6000/optimizer.pt index facc8f0e15affa237084f7fb485f1d9dc996bc21..9da4d13bd392bfabc845f700f919320ac13a4cab 100644 --- a/checkpoints/checkpoint-6000/optimizer.pt +++ b/checkpoints/checkpoint-6000/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37e5d0b4010e8cd292e05beb365a6caf93e26cfb746e237243ef856fa2055400 -size 352735610 +oid sha256:c9e46632a1dfa2221f6d2e4cdc5b385112381572f1fb68fef0443b1328f4be5c +size 1234355130 diff --git a/checkpoints/checkpoint-6000/rng_state.pth b/checkpoints/checkpoint-6000/rng_state.pth index ae89ec44ac2dc0cb6027480d380135b84a8b8314..0bb0b3f6758f728edbc36cf8e60eb249134f1cac 100644 --- a/checkpoints/checkpoint-6000/rng_state.pth +++ b/checkpoints/checkpoint-6000/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a531f647ff978d5bf75b4522198829917cfa163db1951078c889e7146d65dfc +oid sha256:f82c6a71ccfbb018d8f4925e690097b353a0315c2874979733cef79c2c9229f0 size 14244 diff --git a/checkpoints/checkpoint-6000/trainer_state.json b/checkpoints/checkpoint-6000/trainer_state.json index 2c8669e9e4f4fedc13f695e09a5db9ae86f0dd27..7d8ac2ca5ee5cdac5f55860f6d952d2f02ec116a 100644 --- a/checkpoints/checkpoint-6000/trainer_state.json +++ b/checkpoints/checkpoint-6000/trainer_state.json @@ -11,842 +11,842 @@ "log_history": [ { "epoch": 0.24271844660194175, - "grad_norm": 2.239194869995117, + "grad_norm": 2.8362834453582764, "learning_rate": 4.960355987055016e-05, - "loss": 6.099, + "loss": 6.0742, "step": 50 }, { "epoch": 0.4854368932038835, - "grad_norm": 3.8653194904327393, + "grad_norm": 2.9484496116638184, "learning_rate": 4.9199029126213595e-05, - "loss": 5.6954, + "loss": 5.7218, "step": 100 }, { "epoch": 0.7281553398058253, - "grad_norm": 4.652886867523193, + "grad_norm": 3.542572259902954, "learning_rate": 4.879449838187702e-05, - "loss": 5.3598, + "loss": 5.4227, "step": 150 }, { "epoch": 0.970873786407767, - "grad_norm": 5.048999309539795, + "grad_norm": 4.634098052978516, "learning_rate": 4.8389967637540455e-05, - "loss": 5.0982, + "loss": 5.1655, "step": 200 }, { "epoch": 1.2135922330097086, - "grad_norm": 5.21944522857666, + "grad_norm": 5.699330806732178, "learning_rate": 4.798543689320388e-05, - "loss": 4.7456, + "loss": 4.8009, "step": 250 }, { "epoch": 1.4563106796116505, - "grad_norm": 5.471007347106934, + "grad_norm": 5.711933135986328, "learning_rate": 4.7580906148867315e-05, - "loss": 4.6254, + "loss": 4.607, "step": 300 }, { "epoch": 1.6990291262135924, - "grad_norm": 5.345245838165283, + "grad_norm": 5.113691806793213, "learning_rate": 4.717637540453075e-05, - "loss": 4.5216, + "loss": 4.462, "step": 350 }, { "epoch": 1.941747572815534, - "grad_norm": 5.54531192779541, + "grad_norm": 5.632521152496338, "learning_rate": 4.6771844660194174e-05, - "loss": 4.4353, + "loss": 4.3695, "step": 400 }, { "epoch": 2.1844660194174756, - "grad_norm": 5.244582176208496, + "grad_norm": 5.428906440734863, "learning_rate": 4.636731391585761e-05, - "loss": 4.2444, + "loss": 4.1549, "step": 450 }, { "epoch": 2.4271844660194173, - "grad_norm": 5.338261127471924, + "grad_norm": 5.037013530731201, "learning_rate": 4.596278317152104e-05, - "loss": 4.1915, + "loss": 4.0921, "step": 500 }, { "epoch": 2.6699029126213594, - "grad_norm": 5.35228157043457, + "grad_norm": 5.231846809387207, "learning_rate": 4.555825242718447e-05, - "loss": 4.1612, + "loss": 4.0662, "step": 550 }, { "epoch": 2.912621359223301, - "grad_norm": 5.287161827087402, + "grad_norm": 5.506278038024902, "learning_rate": 4.51537216828479e-05, - "loss": 4.1322, + "loss": 4.0403, "step": 600 }, { "epoch": 3.1553398058252426, - "grad_norm": 5.2932209968566895, + "grad_norm": 5.410216331481934, "learning_rate": 4.4749190938511334e-05, - "loss": 3.9684, + "loss": 3.8375, "step": 650 }, { "epoch": 3.3980582524271843, - "grad_norm": 5.4360833168029785, + "grad_norm": 5.637699127197266, "learning_rate": 4.434466019417476e-05, - "loss": 3.8933, + "loss": 3.7473, "step": 700 }, { "epoch": 3.6407766990291264, - "grad_norm": 5.5595221519470215, + "grad_norm": 5.575273513793945, "learning_rate": 4.3940129449838194e-05, - "loss": 3.9306, + "loss": 3.7835, "step": 750 }, { "epoch": 3.883495145631068, - "grad_norm": 5.540086269378662, + "grad_norm": 5.737198352813721, "learning_rate": 4.353559870550162e-05, - "loss": 3.9317, + "loss": 3.7919, "step": 800 }, { "epoch": 4.12621359223301, - "grad_norm": 5.567137718200684, + "grad_norm": 5.645532608032227, "learning_rate": 4.313106796116505e-05, - "loss": 3.8135, + "loss": 3.6425, "step": 850 }, { "epoch": 4.368932038834951, - "grad_norm": 6.00346040725708, + "grad_norm": 6.308927059173584, "learning_rate": 4.272653721682848e-05, - "loss": 3.6924, + "loss": 3.4785, "step": 900 }, { "epoch": 4.611650485436893, - "grad_norm": 5.635782718658447, + "grad_norm": 5.863094329833984, "learning_rate": 4.232200647249191e-05, - "loss": 3.7304, + "loss": 3.5268, "step": 950 }, { "epoch": 4.854368932038835, - "grad_norm": 5.9545440673828125, + "grad_norm": 6.120258331298828, "learning_rate": 4.191747572815534e-05, - "loss": 3.7471, + "loss": 3.5296, "step": 1000 }, { "epoch": 5.097087378640777, - "grad_norm": 5.696166038513184, + "grad_norm": 6.231515884399414, "learning_rate": 4.1512944983818774e-05, - "loss": 3.6287, + "loss": 3.3802, "step": 1050 }, { "epoch": 5.339805825242719, - "grad_norm": 5.844620704650879, + "grad_norm": 6.211188316345215, "learning_rate": 4.11084142394822e-05, - "loss": 3.5299, + "loss": 3.2148, "step": 1100 }, { "epoch": 5.58252427184466, - "grad_norm": 6.373577117919922, + "grad_norm": 6.8676934242248535, "learning_rate": 4.0703883495145634e-05, - "loss": 3.5285, + "loss": 3.2111, "step": 1150 }, { "epoch": 5.825242718446602, - "grad_norm": 6.488651275634766, + "grad_norm": 6.851133346557617, "learning_rate": 4.029935275080906e-05, - "loss": 3.5789, + "loss": 3.2771, "step": 1200 }, { "epoch": 6.067961165048544, - "grad_norm": 6.170980930328369, + "grad_norm": 6.708765506744385, "learning_rate": 3.9894822006472494e-05, - "loss": 3.494, + "loss": 3.1587, "step": 1250 }, { "epoch": 6.310679611650485, - "grad_norm": 6.387782573699951, + "grad_norm": 7.136553764343262, "learning_rate": 3.949029126213593e-05, - "loss": 3.353, + "loss": 2.9192, "step": 1300 }, { "epoch": 6.553398058252427, - "grad_norm": 6.718175888061523, + "grad_norm": 7.370823383331299, "learning_rate": 3.9085760517799354e-05, - "loss": 3.3521, + "loss": 2.922, "step": 1350 }, { "epoch": 6.796116504854369, - "grad_norm": 6.7870025634765625, + "grad_norm": 7.630361557006836, "learning_rate": 3.868122977346279e-05, - "loss": 3.3658, + "loss": 2.9417, "step": 1400 }, { "epoch": 7.038834951456311, - "grad_norm": 6.513721466064453, + "grad_norm": 7.765511989593506, "learning_rate": 3.827669902912622e-05, - "loss": 3.3301, + "loss": 2.8669, "step": 1450 }, { "epoch": 7.281553398058253, - "grad_norm": 7.0680036544799805, + "grad_norm": 7.85439920425415, "learning_rate": 3.787216828478965e-05, - "loss": 3.1289, + "loss": 2.5412, "step": 1500 }, { "epoch": 7.524271844660194, - "grad_norm": 7.019896507263184, + "grad_norm": 8.064071655273438, "learning_rate": 3.746763754045307e-05, - "loss": 3.1878, + "loss": 2.6131, "step": 1550 }, { "epoch": 7.766990291262136, - "grad_norm": 7.3232197761535645, + "grad_norm": 8.692522048950195, "learning_rate": 3.7063106796116507e-05, - "loss": 3.2006, + "loss": 2.6356, "step": 1600 }, { "epoch": 8.009708737864077, - "grad_norm": 7.003912925720215, + "grad_norm": 8.012410163879395, "learning_rate": 3.665857605177993e-05, - "loss": 3.1737, + "loss": 2.599, "step": 1650 }, { "epoch": 8.25242718446602, - "grad_norm": 7.592433929443359, + "grad_norm": 8.6799898147583, "learning_rate": 3.6254045307443366e-05, - "loss": 2.9258, + "loss": 2.1992, "step": 1700 }, { "epoch": 8.495145631067961, - "grad_norm": 7.629278182983398, + "grad_norm": 8.963507652282715, "learning_rate": 3.584951456310679e-05, - "loss": 2.9999, + "loss": 2.2701, "step": 1750 }, { "epoch": 8.737864077669903, - "grad_norm": 8.137636184692383, + "grad_norm": 9.833822250366211, "learning_rate": 3.5444983818770226e-05, - "loss": 3.0155, + "loss": 2.2837, "step": 1800 }, { "epoch": 8.980582524271846, - "grad_norm": 7.967227935791016, + "grad_norm": 9.47242259979248, "learning_rate": 3.504045307443366e-05, - "loss": 2.9945, + "loss": 2.2854, "step": 1850 }, { "epoch": 9.223300970873787, - "grad_norm": 7.9910078048706055, + "grad_norm": 8.856402397155762, "learning_rate": 3.4635922330097086e-05, - "loss": 2.7456, + "loss": 1.8862, "step": 1900 }, { "epoch": 9.466019417475728, - "grad_norm": 8.552720069885254, + "grad_norm": 9.948341369628906, "learning_rate": 3.423139158576052e-05, - "loss": 2.8071, + "loss": 1.922, "step": 1950 }, { "epoch": 9.70873786407767, - "grad_norm": 8.336930274963379, + "grad_norm": 10.28939151763916, "learning_rate": 3.382686084142395e-05, - "loss": 2.8117, + "loss": 1.9622, "step": 2000 }, { "epoch": 9.951456310679612, - "grad_norm": 8.636338233947754, + "grad_norm": 10.114766120910645, "learning_rate": 3.342233009708738e-05, - "loss": 2.8197, + "loss": 1.9741, "step": 2050 }, { "epoch": 10.194174757281553, - "grad_norm": 7.955165863037109, + "grad_norm": 9.489270210266113, "learning_rate": 3.301779935275081e-05, - "loss": 2.6245, + "loss": 1.6509, "step": 2100 }, { "epoch": 10.436893203883495, - "grad_norm": 8.679149627685547, + "grad_norm": 10.16482162475586, "learning_rate": 3.2613268608414246e-05, - "loss": 2.565, + "loss": 1.5883, "step": 2150 }, { "epoch": 10.679611650485437, - "grad_norm": 9.150839805603027, + "grad_norm": 11.778059005737305, "learning_rate": 3.220873786407767e-05, - "loss": 2.6258, + "loss": 1.6234, "step": 2200 }, { "epoch": 10.922330097087379, - "grad_norm": 9.2735013961792, + "grad_norm": 10.798454284667969, "learning_rate": 3.1804207119741106e-05, - "loss": 2.6439, + "loss": 1.667, "step": 2250 }, { "epoch": 11.16504854368932, - "grad_norm": 8.620939254760742, + "grad_norm": 9.751585006713867, "learning_rate": 3.139967637540453e-05, - "loss": 2.4371, + "loss": 1.3785, "step": 2300 }, { "epoch": 11.407766990291263, - "grad_norm": 9.722040176391602, + "grad_norm": 10.266364097595215, "learning_rate": 3.099514563106796e-05, - "loss": 2.4074, + "loss": 1.2956, "step": 2350 }, { "epoch": 11.650485436893204, - "grad_norm": 9.200730323791504, + "grad_norm": 10.660856246948242, "learning_rate": 3.059061488673139e-05, - "loss": 2.4307, + "loss": 1.3321, "step": 2400 }, { "epoch": 11.893203883495145, - "grad_norm": 9.692273139953613, + "grad_norm": 10.896187782287598, "learning_rate": 3.0186084142394822e-05, - "loss": 2.424, + "loss": 1.3533, "step": 2450 }, { "epoch": 12.135922330097088, - "grad_norm": 9.822249412536621, + "grad_norm": 10.952502250671387, "learning_rate": 2.9781553398058252e-05, - "loss": 2.2862, + "loss": 1.1486, "step": 2500 }, { "epoch": 12.37864077669903, - "grad_norm": 9.919060707092285, + "grad_norm": 10.766463279724121, "learning_rate": 2.9377022653721686e-05, - "loss": 2.1913, + "loss": 1.0227, "step": 2550 }, { "epoch": 12.62135922330097, - "grad_norm": 9.181950569152832, + "grad_norm": 9.385764122009277, "learning_rate": 2.8972491909385112e-05, - "loss": 2.241, + "loss": 1.0651, "step": 2600 }, { "epoch": 12.864077669902912, - "grad_norm": 10.82077407836914, + "grad_norm": 11.573925018310547, "learning_rate": 2.8567961165048546e-05, - "loss": 2.2653, + "loss": 1.1013, "step": 2650 }, { "epoch": 13.106796116504855, - "grad_norm": 10.365830421447754, + "grad_norm": 10.138091087341309, "learning_rate": 2.816343042071198e-05, - "loss": 2.1352, + "loss": 0.9545, "step": 2700 }, { "epoch": 13.349514563106796, - "grad_norm": 10.073640823364258, + "grad_norm": 9.867119789123535, "learning_rate": 2.7758899676375405e-05, - "loss": 2.0227, + "loss": 0.7892, "step": 2750 }, { "epoch": 13.592233009708737, - "grad_norm": 10.520242691040039, + "grad_norm": 10.443971633911133, "learning_rate": 2.735436893203884e-05, - "loss": 2.0579, + "loss": 0.8432, "step": 2800 }, { "epoch": 13.83495145631068, - "grad_norm": 11.176478385925293, + "grad_norm": 10.633298873901367, "learning_rate": 2.6949838187702265e-05, - "loss": 2.0934, + "loss": 0.8543, "step": 2850 }, { "epoch": 14.077669902912621, - "grad_norm": 9.541868209838867, + "grad_norm": 8.640830039978027, "learning_rate": 2.6545307443365695e-05, - "loss": 2.0118, + "loss": 0.7751, "step": 2900 }, { "epoch": 14.320388349514563, - "grad_norm": 10.162981986999512, + "grad_norm": 10.461247444152832, "learning_rate": 2.614077669902913e-05, - "loss": 1.8624, + "loss": 0.6074, "step": 2950 }, { "epoch": 14.563106796116505, - "grad_norm": 10.51291275024414, + "grad_norm": 10.757479667663574, "learning_rate": 2.5736245954692555e-05, - "loss": 1.8979, + "loss": 0.6354, "step": 3000 }, { "epoch": 14.805825242718447, - "grad_norm": 11.316717147827148, + "grad_norm": 11.599132537841797, "learning_rate": 2.533171521035599e-05, - "loss": 1.915, + "loss": 0.6694, "step": 3050 }, { "epoch": 15.048543689320388, - "grad_norm": 9.184353828430176, + "grad_norm": 8.085358619689941, "learning_rate": 2.492718446601942e-05, - "loss": 1.885, + "loss": 0.6306, "step": 3100 }, { "epoch": 15.29126213592233, - "grad_norm": 10.70876693725586, + "grad_norm": 8.327988624572754, "learning_rate": 2.452265372168285e-05, - "loss": 1.7024, + "loss": 0.4628, "step": 3150 }, { "epoch": 15.533980582524272, - "grad_norm": 10.767953872680664, + "grad_norm": 8.545391082763672, "learning_rate": 2.411812297734628e-05, - "loss": 1.729, + "loss": 0.4915, "step": 3200 }, { "epoch": 15.776699029126213, - "grad_norm": 11.83687973022461, + "grad_norm": 10.176375389099121, "learning_rate": 2.3713592233009708e-05, - "loss": 1.7697, + "loss": 0.5105, "step": 3250 }, { "epoch": 16.019417475728154, - "grad_norm": 9.888689994812012, + "grad_norm": 7.821159362792969, "learning_rate": 2.3309061488673138e-05, - "loss": 1.7605, + "loss": 0.5055, "step": 3300 }, { "epoch": 16.262135922330096, - "grad_norm": 9.394012451171875, + "grad_norm": 7.797800064086914, "learning_rate": 2.290453074433657e-05, - "loss": 1.5389, + "loss": 0.3486, "step": 3350 }, { "epoch": 16.50485436893204, - "grad_norm": 11.3172025680542, + "grad_norm": 8.497370719909668, "learning_rate": 2.25e-05, - "loss": 1.5836, + "loss": 0.3627, "step": 3400 }, { "epoch": 16.74757281553398, - "grad_norm": 11.000115394592285, + "grad_norm": 9.842249870300293, "learning_rate": 2.209546925566343e-05, - "loss": 1.6398, + "loss": 0.3969, "step": 3450 }, { "epoch": 16.990291262135923, - "grad_norm": 11.453404426574707, + "grad_norm": 10.705739974975586, "learning_rate": 2.169093851132686e-05, - "loss": 1.6466, + "loss": 0.4028, "step": 3500 }, { "epoch": 17.233009708737864, - "grad_norm": 11.012279510498047, + "grad_norm": 7.950681686401367, "learning_rate": 2.1286407766990295e-05, - "loss": 1.4434, + "loss": 0.284, "step": 3550 }, { "epoch": 17.475728155339805, - "grad_norm": 11.989383697509766, + "grad_norm": 8.526535034179688, "learning_rate": 2.0881877022653725e-05, - "loss": 1.4708, + "loss": 0.2934, "step": 3600 }, { "epoch": 17.718446601941746, - "grad_norm": 12.47008991241455, + "grad_norm": 8.999290466308594, "learning_rate": 2.047734627831715e-05, - "loss": 1.4783, + "loss": 0.2964, "step": 3650 }, { "epoch": 17.96116504854369, - "grad_norm": 10.983756065368652, + "grad_norm": 9.228137969970703, "learning_rate": 2.007281553398058e-05, - "loss": 1.4839, + "loss": 0.311, "step": 3700 }, { "epoch": 18.203883495145632, - "grad_norm": 9.582510948181152, + "grad_norm": 6.715802192687988, "learning_rate": 1.9668284789644014e-05, - "loss": 1.3226, + "loss": 0.2263, "step": 3750 }, { "epoch": 18.446601941747574, - "grad_norm": 11.663228034973145, + "grad_norm": 7.834264278411865, "learning_rate": 1.9263754045307444e-05, - "loss": 1.3522, + "loss": 0.2293, "step": 3800 }, { "epoch": 18.689320388349515, - "grad_norm": 11.278139114379883, + "grad_norm": 6.83228874206543, "learning_rate": 1.8859223300970874e-05, - "loss": 1.3753, + "loss": 0.2392, "step": 3850 }, { "epoch": 18.932038834951456, - "grad_norm": 10.586488723754883, + "grad_norm": 8.507209777832031, "learning_rate": 1.8454692556634304e-05, - "loss": 1.3774, + "loss": 0.2377, "step": 3900 }, { "epoch": 19.174757281553397, - "grad_norm": 11.243274688720703, + "grad_norm": 6.565855026245117, "learning_rate": 1.8050161812297738e-05, - "loss": 1.2506, + "loss": 0.1905, "step": 3950 }, { "epoch": 19.41747572815534, - "grad_norm": 10.601967811584473, + "grad_norm": 7.470388889312744, "learning_rate": 1.7645631067961167e-05, - "loss": 1.2276, + "loss": 0.1846, "step": 4000 }, { "epoch": 19.660194174757283, - "grad_norm": 10.919422149658203, + "grad_norm": 7.033578395843506, "learning_rate": 1.7241100323624594e-05, - "loss": 1.2518, + "loss": 0.1911, "step": 4050 }, { "epoch": 19.902912621359224, - "grad_norm": 11.303253173828125, + "grad_norm": 8.110376358032227, "learning_rate": 1.6836569579288027e-05, - "loss": 1.279, + "loss": 0.1954, "step": 4100 }, { "epoch": 20.145631067961165, - "grad_norm": 10.864278793334961, + "grad_norm": 7.335451602935791, "learning_rate": 1.6432038834951457e-05, - "loss": 1.1686, + "loss": 0.1619, "step": 4150 }, { "epoch": 20.388349514563107, - "grad_norm": 11.179913520812988, + "grad_norm": 7.478648662567139, "learning_rate": 1.6027508090614887e-05, - "loss": 1.1252, + "loss": 0.1497, "step": 4200 }, { "epoch": 20.631067961165048, - "grad_norm": 10.640514373779297, + "grad_norm": 7.764925479888916, "learning_rate": 1.5622977346278317e-05, - "loss": 1.1581, + "loss": 0.1559, "step": 4250 }, { "epoch": 20.87378640776699, - "grad_norm": 11.186044692993164, + "grad_norm": 6.432894706726074, "learning_rate": 1.5218446601941749e-05, - "loss": 1.1782, + "loss": 0.1601, "step": 4300 }, { "epoch": 21.116504854368934, - "grad_norm": 11.105583190917969, + "grad_norm": 7.728542327880859, "learning_rate": 1.4813915857605179e-05, - "loss": 1.0972, + "loss": 0.1416, "step": 4350 }, { "epoch": 21.359223300970875, - "grad_norm": 10.809669494628906, + "grad_norm": 6.370733737945557, "learning_rate": 1.4409385113268609e-05, - "loss": 1.0487, + "loss": 0.1211, "step": 4400 }, { "epoch": 21.601941747572816, - "grad_norm": 11.11020278930664, + "grad_norm": 5.939419746398926, "learning_rate": 1.4004854368932039e-05, - "loss": 1.0566, + "loss": 0.1278, "step": 4450 }, { "epoch": 21.844660194174757, - "grad_norm": 11.062104225158691, + "grad_norm": 6.0013957023620605, "learning_rate": 1.360032362459547e-05, - "loss": 1.078, + "loss": 0.1274, "step": 4500 }, { "epoch": 22.0873786407767, - "grad_norm": 10.393362998962402, + "grad_norm": 6.1365966796875, "learning_rate": 1.31957928802589e-05, - "loss": 1.0351, + "loss": 0.1205, "step": 4550 }, { "epoch": 22.33009708737864, - "grad_norm": 9.539032936096191, + "grad_norm": 5.679115295410156, "learning_rate": 1.279126213592233e-05, - "loss": 0.9655, + "loss": 0.106, "step": 4600 }, { "epoch": 22.57281553398058, - "grad_norm": 10.375469207763672, + "grad_norm": 4.750304698944092, "learning_rate": 1.2386731391585762e-05, - "loss": 0.9857, + "loss": 0.108, "step": 4650 }, { "epoch": 22.815533980582526, - "grad_norm": 11.338729858398438, + "grad_norm": 6.115650653839111, "learning_rate": 1.1982200647249192e-05, - "loss": 0.9975, + "loss": 0.1086, "step": 4700 }, { "epoch": 23.058252427184467, - "grad_norm": 10.976126670837402, + "grad_norm": 5.252938270568848, "learning_rate": 1.1577669902912622e-05, - "loss": 0.9785, + "loss": 0.1041, "step": 4750 }, { "epoch": 23.300970873786408, - "grad_norm": 11.16224193572998, + "grad_norm": 5.175116062164307, "learning_rate": 1.1173139158576053e-05, - "loss": 0.9011, + "loss": 0.0872, "step": 4800 }, { "epoch": 23.54368932038835, - "grad_norm": 11.640942573547363, + "grad_norm": 5.768143653869629, "learning_rate": 1.0768608414239483e-05, - "loss": 0.9056, + "loss": 0.0945, "step": 4850 }, { "epoch": 23.78640776699029, - "grad_norm": 10.686304092407227, + "grad_norm": 5.235666275024414, "learning_rate": 1.0364077669902913e-05, - "loss": 0.9267, + "loss": 0.0969, "step": 4900 }, { "epoch": 24.02912621359223, - "grad_norm": 9.798011779785156, + "grad_norm": 4.407253265380859, "learning_rate": 9.959546925566343e-06, - "loss": 0.9134, + "loss": 0.0933, "step": 4950 }, { "epoch": 24.271844660194176, - "grad_norm": 10.638201713562012, + "grad_norm": 5.28978157043457, "learning_rate": 9.555016181229775e-06, - "loss": 0.8257, + "loss": 0.0766, "step": 5000 }, { "epoch": 24.514563106796118, - "grad_norm": 10.393739700317383, + "grad_norm": 6.059443950653076, "learning_rate": 9.150485436893205e-06, - "loss": 0.8574, + "loss": 0.082, "step": 5050 }, { "epoch": 24.75728155339806, - "grad_norm": 11.262478828430176, + "grad_norm": 4.627708435058594, "learning_rate": 8.745954692556635e-06, - "loss": 0.8645, + "loss": 0.0785, "step": 5100 }, { "epoch": 25.0, - "grad_norm": 12.033185005187988, + "grad_norm": 5.263092041015625, "learning_rate": 8.341423948220065e-06, - "loss": 0.8719, + "loss": 0.0816, "step": 5150 }, { "epoch": 25.24271844660194, - "grad_norm": 10.399738311767578, + "grad_norm": 6.337419509887695, "learning_rate": 7.936893203883496e-06, - "loss": 0.7727, + "loss": 0.0676, "step": 5200 }, { "epoch": 25.485436893203882, - "grad_norm": 9.93704605102539, + "grad_norm": 3.846252202987671, "learning_rate": 7.532362459546925e-06, - "loss": 0.799, + "loss": 0.0699, "step": 5250 }, { "epoch": 25.728155339805824, - "grad_norm": 9.828903198242188, + "grad_norm": 5.770230293273926, "learning_rate": 7.127831715210356e-06, - "loss": 0.8073, + "loss": 0.0673, "step": 5300 }, { "epoch": 25.97087378640777, - "grad_norm": 11.109928131103516, + "grad_norm": 4.10913610458374, "learning_rate": 6.723300970873788e-06, - "loss": 0.8155, + "loss": 0.0706, "step": 5350 }, { "epoch": 26.21359223300971, - "grad_norm": 10.467114448547363, + "grad_norm": 4.198057174682617, "learning_rate": 6.318770226537217e-06, - "loss": 0.7476, + "loss": 0.0615, "step": 5400 }, { "epoch": 26.45631067961165, - "grad_norm": 10.594189643859863, + "grad_norm": 5.369973182678223, "learning_rate": 5.914239482200648e-06, - "loss": 0.7518, + "loss": 0.0585, "step": 5450 }, { "epoch": 26.699029126213592, - "grad_norm": 9.673776626586914, + "grad_norm": 5.3213324546813965, "learning_rate": 5.5097087378640776e-06, - "loss": 0.7584, + "loss": 0.0594, "step": 5500 }, { "epoch": 26.941747572815533, - "grad_norm": 10.382043838500977, + "grad_norm": 3.871309518814087, "learning_rate": 5.105177993527508e-06, - "loss": 0.7637, + "loss": 0.058, "step": 5550 }, { "epoch": 27.184466019417474, - "grad_norm": 9.92479133605957, + "grad_norm": 5.000518798828125, "learning_rate": 4.700647249190938e-06, - "loss": 0.722, + "loss": 0.0562, "step": 5600 }, { "epoch": 27.42718446601942, - "grad_norm": 9.948137283325195, + "grad_norm": 5.81850004196167, "learning_rate": 4.296116504854369e-06, - "loss": 0.7195, + "loss": 0.0523, "step": 5650 }, { "epoch": 27.66990291262136, - "grad_norm": 10.299307823181152, + "grad_norm": 4.536961555480957, "learning_rate": 3.8915857605178e-06, - "loss": 0.7244, + "loss": 0.0552, "step": 5700 }, { "epoch": 27.9126213592233, - "grad_norm": 10.693897247314453, + "grad_norm": 3.8314859867095947, "learning_rate": 3.4870550161812302e-06, - "loss": 0.7299, + "loss": 0.0521, "step": 5750 }, { "epoch": 28.155339805825243, - "grad_norm": 9.83080768585205, + "grad_norm": 5.764403343200684, "learning_rate": 3.0825242718446606e-06, - "loss": 0.7041, + "loss": 0.0476, "step": 5800 }, { "epoch": 28.398058252427184, - "grad_norm": 10.031473159790039, + "grad_norm": 2.81915020942688, "learning_rate": 2.677993527508091e-06, - "loss": 0.6903, + "loss": 0.0484, "step": 5850 }, { "epoch": 28.640776699029125, - "grad_norm": 9.686220169067383, + "grad_norm": 5.184347152709961, "learning_rate": 2.2734627831715213e-06, - "loss": 0.6882, + "loss": 0.0459, "step": 5900 }, { "epoch": 28.883495145631066, - "grad_norm": 9.987640380859375, + "grad_norm": 2.9731695652008057, "learning_rate": 1.8689320388349515e-06, - "loss": 0.6918, + "loss": 0.0471, "step": 5950 }, { "epoch": 29.12621359223301, - "grad_norm": 9.621675491333008, + "grad_norm": 6.170887470245361, "learning_rate": 1.4644012944983818e-06, - "loss": 0.6878, + "loss": 0.045, "step": 6000 } ], @@ -867,7 +867,7 @@ "attributes": {} } }, - "total_flos": 783890251776000.0, + "total_flos": 2786120957952000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoints/checkpoint-6000/training_args.bin b/checkpoints/checkpoint-6000/training_args.bin index 0403c28e41ead56dc065860ac748f500b2df575a..6dcc5a1a1def25e4256c25ec89c109edb25731fb 100644 --- a/checkpoints/checkpoint-6000/training_args.bin +++ b/checkpoints/checkpoint-6000/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7935e3b7629485cf26f624afe618b58a390d4c3c9c6dcf878b3ba9099ef1027b +oid sha256:d5c04ce53d557cd762ff77607aaadbd1248aaa177697662d9a21b90d444993b7 size 5304 diff --git a/checkpoints/checkpoint-6180/config.json b/checkpoints/checkpoint-6180/config.json index fdd810faba252b1c3f085663999fe9c24e8e44d6..990b1515737ce9bfc8638b4941fcfacdce1334d3 100644 --- a/checkpoints/checkpoint-6180/config.json +++ b/checkpoints/checkpoint-6180/config.json @@ -10,12 +10,12 @@ "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", - "n_ctx": 1024, - "n_embd": 768, + "n_ctx": 2048, + "n_embd": 1024, "n_head": 2, "n_inner": null, - "n_layer": 6, - "n_positions": 1024, + "n_layer": 12, + "n_positions": 2048, "pad_token_id": 1, "reorder_and_upcast_attn": false, "resid_pdrop": 0.1, diff --git a/checkpoints/checkpoint-6180/model.safetensors b/checkpoints/checkpoint-6180/model.safetensors index b789936bfb9c0ef9e42c2c304f40eb4b62d54c64..3e79c22b7e5b8885bb0e24ffa22944eaec0c8d2e 100644 --- a/checkpoints/checkpoint-6180/model.safetensors +++ b/checkpoints/checkpoint-6180/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:964574ee2496392ed8546d35dde6ee1c393d9db5c163a62154e223c6dc7aa7c8 -size 176343496 +oid sha256:20d550b61dd9790a42142014e712c6d94de2be31139ec39dcbbf926a20d95619 +size 617130824 diff --git a/checkpoints/checkpoint-6180/optimizer.pt b/checkpoints/checkpoint-6180/optimizer.pt index be0701b94c3451b7a6561cd815b96cc3d3845aaf..47658092cdb00dd79af436431a98a6134ab330a4 100644 --- a/checkpoints/checkpoint-6180/optimizer.pt +++ b/checkpoints/checkpoint-6180/optimizer.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41c1d724dec0e9d865699fd343545dcbed97842e66261224d912cd6f01eaac9a -size 352735610 +oid sha256:a251771a5e13325b181850cf9862a9948c2731ad943eb0252d68ff7b9544c892 +size 1234355130 diff --git a/checkpoints/checkpoint-6180/rng_state.pth b/checkpoints/checkpoint-6180/rng_state.pth index c1e5b2cd65613d44c849b14f2d1f77a175198c9d..2cebe129bfc9588f1337923cd898a09b48dd95a6 100644 --- a/checkpoints/checkpoint-6180/rng_state.pth +++ b/checkpoints/checkpoint-6180/rng_state.pth @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb9e62e9bfff34e9d5a662f063dc6cf7ce6fd6c542ae5a0aeac8570a1e1f5447 +oid sha256:d5c9a04f545dc7245fc5ec685eaff914bc883e1a413118dea0d023286d0b711a size 14244 diff --git a/checkpoints/checkpoint-6180/trainer_state.json b/checkpoints/checkpoint-6180/trainer_state.json index 94c61d1bdbbde8182b652b587686ed1e8b0c8443..d48df7fa4a45348a8a1ceb07f2cae04aad8a0a20 100644 --- a/checkpoints/checkpoint-6180/trainer_state.json +++ b/checkpoints/checkpoint-6180/trainer_state.json @@ -11,863 +11,863 @@ "log_history": [ { "epoch": 0.24271844660194175, - "grad_norm": 2.239194869995117, + "grad_norm": 2.8362834453582764, "learning_rate": 4.960355987055016e-05, - "loss": 6.099, + "loss": 6.0742, "step": 50 }, { "epoch": 0.4854368932038835, - "grad_norm": 3.8653194904327393, + "grad_norm": 2.9484496116638184, "learning_rate": 4.9199029126213595e-05, - "loss": 5.6954, + "loss": 5.7218, "step": 100 }, { "epoch": 0.7281553398058253, - "grad_norm": 4.652886867523193, + "grad_norm": 3.542572259902954, "learning_rate": 4.879449838187702e-05, - "loss": 5.3598, + "loss": 5.4227, "step": 150 }, { "epoch": 0.970873786407767, - "grad_norm": 5.048999309539795, + "grad_norm": 4.634098052978516, "learning_rate": 4.8389967637540455e-05, - "loss": 5.0982, + "loss": 5.1655, "step": 200 }, { "epoch": 1.2135922330097086, - "grad_norm": 5.21944522857666, + "grad_norm": 5.699330806732178, "learning_rate": 4.798543689320388e-05, - "loss": 4.7456, + "loss": 4.8009, "step": 250 }, { "epoch": 1.4563106796116505, - "grad_norm": 5.471007347106934, + "grad_norm": 5.711933135986328, "learning_rate": 4.7580906148867315e-05, - "loss": 4.6254, + "loss": 4.607, "step": 300 }, { "epoch": 1.6990291262135924, - "grad_norm": 5.345245838165283, + "grad_norm": 5.113691806793213, "learning_rate": 4.717637540453075e-05, - "loss": 4.5216, + "loss": 4.462, "step": 350 }, { "epoch": 1.941747572815534, - "grad_norm": 5.54531192779541, + "grad_norm": 5.632521152496338, "learning_rate": 4.6771844660194174e-05, - "loss": 4.4353, + "loss": 4.3695, "step": 400 }, { "epoch": 2.1844660194174756, - "grad_norm": 5.244582176208496, + "grad_norm": 5.428906440734863, "learning_rate": 4.636731391585761e-05, - "loss": 4.2444, + "loss": 4.1549, "step": 450 }, { "epoch": 2.4271844660194173, - "grad_norm": 5.338261127471924, + "grad_norm": 5.037013530731201, "learning_rate": 4.596278317152104e-05, - "loss": 4.1915, + "loss": 4.0921, "step": 500 }, { "epoch": 2.6699029126213594, - "grad_norm": 5.35228157043457, + "grad_norm": 5.231846809387207, "learning_rate": 4.555825242718447e-05, - "loss": 4.1612, + "loss": 4.0662, "step": 550 }, { "epoch": 2.912621359223301, - "grad_norm": 5.287161827087402, + "grad_norm": 5.506278038024902, "learning_rate": 4.51537216828479e-05, - "loss": 4.1322, + "loss": 4.0403, "step": 600 }, { "epoch": 3.1553398058252426, - "grad_norm": 5.2932209968566895, + "grad_norm": 5.410216331481934, "learning_rate": 4.4749190938511334e-05, - "loss": 3.9684, + "loss": 3.8375, "step": 650 }, { "epoch": 3.3980582524271843, - "grad_norm": 5.4360833168029785, + "grad_norm": 5.637699127197266, "learning_rate": 4.434466019417476e-05, - "loss": 3.8933, + "loss": 3.7473, "step": 700 }, { "epoch": 3.6407766990291264, - "grad_norm": 5.5595221519470215, + "grad_norm": 5.575273513793945, "learning_rate": 4.3940129449838194e-05, - "loss": 3.9306, + "loss": 3.7835, "step": 750 }, { "epoch": 3.883495145631068, - "grad_norm": 5.540086269378662, + "grad_norm": 5.737198352813721, "learning_rate": 4.353559870550162e-05, - "loss": 3.9317, + "loss": 3.7919, "step": 800 }, { "epoch": 4.12621359223301, - "grad_norm": 5.567137718200684, + "grad_norm": 5.645532608032227, "learning_rate": 4.313106796116505e-05, - "loss": 3.8135, + "loss": 3.6425, "step": 850 }, { "epoch": 4.368932038834951, - "grad_norm": 6.00346040725708, + "grad_norm": 6.308927059173584, "learning_rate": 4.272653721682848e-05, - "loss": 3.6924, + "loss": 3.4785, "step": 900 }, { "epoch": 4.611650485436893, - "grad_norm": 5.635782718658447, + "grad_norm": 5.863094329833984, "learning_rate": 4.232200647249191e-05, - "loss": 3.7304, + "loss": 3.5268, "step": 950 }, { "epoch": 4.854368932038835, - "grad_norm": 5.9545440673828125, + "grad_norm": 6.120258331298828, "learning_rate": 4.191747572815534e-05, - "loss": 3.7471, + "loss": 3.5296, "step": 1000 }, { "epoch": 5.097087378640777, - "grad_norm": 5.696166038513184, + "grad_norm": 6.231515884399414, "learning_rate": 4.1512944983818774e-05, - "loss": 3.6287, + "loss": 3.3802, "step": 1050 }, { "epoch": 5.339805825242719, - "grad_norm": 5.844620704650879, + "grad_norm": 6.211188316345215, "learning_rate": 4.11084142394822e-05, - "loss": 3.5299, + "loss": 3.2148, "step": 1100 }, { "epoch": 5.58252427184466, - "grad_norm": 6.373577117919922, + "grad_norm": 6.8676934242248535, "learning_rate": 4.0703883495145634e-05, - "loss": 3.5285, + "loss": 3.2111, "step": 1150 }, { "epoch": 5.825242718446602, - "grad_norm": 6.488651275634766, + "grad_norm": 6.851133346557617, "learning_rate": 4.029935275080906e-05, - "loss": 3.5789, + "loss": 3.2771, "step": 1200 }, { "epoch": 6.067961165048544, - "grad_norm": 6.170980930328369, + "grad_norm": 6.708765506744385, "learning_rate": 3.9894822006472494e-05, - "loss": 3.494, + "loss": 3.1587, "step": 1250 }, { "epoch": 6.310679611650485, - "grad_norm": 6.387782573699951, + "grad_norm": 7.136553764343262, "learning_rate": 3.949029126213593e-05, - "loss": 3.353, + "loss": 2.9192, "step": 1300 }, { "epoch": 6.553398058252427, - "grad_norm": 6.718175888061523, + "grad_norm": 7.370823383331299, "learning_rate": 3.9085760517799354e-05, - "loss": 3.3521, + "loss": 2.922, "step": 1350 }, { "epoch": 6.796116504854369, - "grad_norm": 6.7870025634765625, + "grad_norm": 7.630361557006836, "learning_rate": 3.868122977346279e-05, - "loss": 3.3658, + "loss": 2.9417, "step": 1400 }, { "epoch": 7.038834951456311, - "grad_norm": 6.513721466064453, + "grad_norm": 7.765511989593506, "learning_rate": 3.827669902912622e-05, - "loss": 3.3301, + "loss": 2.8669, "step": 1450 }, { "epoch": 7.281553398058253, - "grad_norm": 7.0680036544799805, + "grad_norm": 7.85439920425415, "learning_rate": 3.787216828478965e-05, - "loss": 3.1289, + "loss": 2.5412, "step": 1500 }, { "epoch": 7.524271844660194, - "grad_norm": 7.019896507263184, + "grad_norm": 8.064071655273438, "learning_rate": 3.746763754045307e-05, - "loss": 3.1878, + "loss": 2.6131, "step": 1550 }, { "epoch": 7.766990291262136, - "grad_norm": 7.3232197761535645, + "grad_norm": 8.692522048950195, "learning_rate": 3.7063106796116507e-05, - "loss": 3.2006, + "loss": 2.6356, "step": 1600 }, { "epoch": 8.009708737864077, - "grad_norm": 7.003912925720215, + "grad_norm": 8.012410163879395, "learning_rate": 3.665857605177993e-05, - "loss": 3.1737, + "loss": 2.599, "step": 1650 }, { "epoch": 8.25242718446602, - "grad_norm": 7.592433929443359, + "grad_norm": 8.6799898147583, "learning_rate": 3.6254045307443366e-05, - "loss": 2.9258, + "loss": 2.1992, "step": 1700 }, { "epoch": 8.495145631067961, - "grad_norm": 7.629278182983398, + "grad_norm": 8.963507652282715, "learning_rate": 3.584951456310679e-05, - "loss": 2.9999, + "loss": 2.2701, "step": 1750 }, { "epoch": 8.737864077669903, - "grad_norm": 8.137636184692383, + "grad_norm": 9.833822250366211, "learning_rate": 3.5444983818770226e-05, - "loss": 3.0155, + "loss": 2.2837, "step": 1800 }, { "epoch": 8.980582524271846, - "grad_norm": 7.967227935791016, + "grad_norm": 9.47242259979248, "learning_rate": 3.504045307443366e-05, - "loss": 2.9945, + "loss": 2.2854, "step": 1850 }, { "epoch": 9.223300970873787, - "grad_norm": 7.9910078048706055, + "grad_norm": 8.856402397155762, "learning_rate": 3.4635922330097086e-05, - "loss": 2.7456, + "loss": 1.8862, "step": 1900 }, { "epoch": 9.466019417475728, - "grad_norm": 8.552720069885254, + "grad_norm": 9.948341369628906, "learning_rate": 3.423139158576052e-05, - "loss": 2.8071, + "loss": 1.922, "step": 1950 }, { "epoch": 9.70873786407767, - "grad_norm": 8.336930274963379, + "grad_norm": 10.28939151763916, "learning_rate": 3.382686084142395e-05, - "loss": 2.8117, + "loss": 1.9622, "step": 2000 }, { "epoch": 9.951456310679612, - "grad_norm": 8.636338233947754, + "grad_norm": 10.114766120910645, "learning_rate": 3.342233009708738e-05, - "loss": 2.8197, + "loss": 1.9741, "step": 2050 }, { "epoch": 10.194174757281553, - "grad_norm": 7.955165863037109, + "grad_norm": 9.489270210266113, "learning_rate": 3.301779935275081e-05, - "loss": 2.6245, + "loss": 1.6509, "step": 2100 }, { "epoch": 10.436893203883495, - "grad_norm": 8.679149627685547, + "grad_norm": 10.16482162475586, "learning_rate": 3.2613268608414246e-05, - "loss": 2.565, + "loss": 1.5883, "step": 2150 }, { "epoch": 10.679611650485437, - "grad_norm": 9.150839805603027, + "grad_norm": 11.778059005737305, "learning_rate": 3.220873786407767e-05, - "loss": 2.6258, + "loss": 1.6234, "step": 2200 }, { "epoch": 10.922330097087379, - "grad_norm": 9.2735013961792, + "grad_norm": 10.798454284667969, "learning_rate": 3.1804207119741106e-05, - "loss": 2.6439, + "loss": 1.667, "step": 2250 }, { "epoch": 11.16504854368932, - "grad_norm": 8.620939254760742, + "grad_norm": 9.751585006713867, "learning_rate": 3.139967637540453e-05, - "loss": 2.4371, + "loss": 1.3785, "step": 2300 }, { "epoch": 11.407766990291263, - "grad_norm": 9.722040176391602, + "grad_norm": 10.266364097595215, "learning_rate": 3.099514563106796e-05, - "loss": 2.4074, + "loss": 1.2956, "step": 2350 }, { "epoch": 11.650485436893204, - "grad_norm": 9.200730323791504, + "grad_norm": 10.660856246948242, "learning_rate": 3.059061488673139e-05, - "loss": 2.4307, + "loss": 1.3321, "step": 2400 }, { "epoch": 11.893203883495145, - "grad_norm": 9.692273139953613, + "grad_norm": 10.896187782287598, "learning_rate": 3.0186084142394822e-05, - "loss": 2.424, + "loss": 1.3533, "step": 2450 }, { "epoch": 12.135922330097088, - "grad_norm": 9.822249412536621, + "grad_norm": 10.952502250671387, "learning_rate": 2.9781553398058252e-05, - "loss": 2.2862, + "loss": 1.1486, "step": 2500 }, { "epoch": 12.37864077669903, - "grad_norm": 9.919060707092285, + "grad_norm": 10.766463279724121, "learning_rate": 2.9377022653721686e-05, - "loss": 2.1913, + "loss": 1.0227, "step": 2550 }, { "epoch": 12.62135922330097, - "grad_norm": 9.181950569152832, + "grad_norm": 9.385764122009277, "learning_rate": 2.8972491909385112e-05, - "loss": 2.241, + "loss": 1.0651, "step": 2600 }, { "epoch": 12.864077669902912, - "grad_norm": 10.82077407836914, + "grad_norm": 11.573925018310547, "learning_rate": 2.8567961165048546e-05, - "loss": 2.2653, + "loss": 1.1013, "step": 2650 }, { "epoch": 13.106796116504855, - "grad_norm": 10.365830421447754, + "grad_norm": 10.138091087341309, "learning_rate": 2.816343042071198e-05, - "loss": 2.1352, + "loss": 0.9545, "step": 2700 }, { "epoch": 13.349514563106796, - "grad_norm": 10.073640823364258, + "grad_norm": 9.867119789123535, "learning_rate": 2.7758899676375405e-05, - "loss": 2.0227, + "loss": 0.7892, "step": 2750 }, { "epoch": 13.592233009708737, - "grad_norm": 10.520242691040039, + "grad_norm": 10.443971633911133, "learning_rate": 2.735436893203884e-05, - "loss": 2.0579, + "loss": 0.8432, "step": 2800 }, { "epoch": 13.83495145631068, - "grad_norm": 11.176478385925293, + "grad_norm": 10.633298873901367, "learning_rate": 2.6949838187702265e-05, - "loss": 2.0934, + "loss": 0.8543, "step": 2850 }, { "epoch": 14.077669902912621, - "grad_norm": 9.541868209838867, + "grad_norm": 8.640830039978027, "learning_rate": 2.6545307443365695e-05, - "loss": 2.0118, + "loss": 0.7751, "step": 2900 }, { "epoch": 14.320388349514563, - "grad_norm": 10.162981986999512, + "grad_norm": 10.461247444152832, "learning_rate": 2.614077669902913e-05, - "loss": 1.8624, + "loss": 0.6074, "step": 2950 }, { "epoch": 14.563106796116505, - "grad_norm": 10.51291275024414, + "grad_norm": 10.757479667663574, "learning_rate": 2.5736245954692555e-05, - "loss": 1.8979, + "loss": 0.6354, "step": 3000 }, { "epoch": 14.805825242718447, - "grad_norm": 11.316717147827148, + "grad_norm": 11.599132537841797, "learning_rate": 2.533171521035599e-05, - "loss": 1.915, + "loss": 0.6694, "step": 3050 }, { "epoch": 15.048543689320388, - "grad_norm": 9.184353828430176, + "grad_norm": 8.085358619689941, "learning_rate": 2.492718446601942e-05, - "loss": 1.885, + "loss": 0.6306, "step": 3100 }, { "epoch": 15.29126213592233, - "grad_norm": 10.70876693725586, + "grad_norm": 8.327988624572754, "learning_rate": 2.452265372168285e-05, - "loss": 1.7024, + "loss": 0.4628, "step": 3150 }, { "epoch": 15.533980582524272, - "grad_norm": 10.767953872680664, + "grad_norm": 8.545391082763672, "learning_rate": 2.411812297734628e-05, - "loss": 1.729, + "loss": 0.4915, "step": 3200 }, { "epoch": 15.776699029126213, - "grad_norm": 11.83687973022461, + "grad_norm": 10.176375389099121, "learning_rate": 2.3713592233009708e-05, - "loss": 1.7697, + "loss": 0.5105, "step": 3250 }, { "epoch": 16.019417475728154, - "grad_norm": 9.888689994812012, + "grad_norm": 7.821159362792969, "learning_rate": 2.3309061488673138e-05, - "loss": 1.7605, + "loss": 0.5055, "step": 3300 }, { "epoch": 16.262135922330096, - "grad_norm": 9.394012451171875, + "grad_norm": 7.797800064086914, "learning_rate": 2.290453074433657e-05, - "loss": 1.5389, + "loss": 0.3486, "step": 3350 }, { "epoch": 16.50485436893204, - "grad_norm": 11.3172025680542, + "grad_norm": 8.497370719909668, "learning_rate": 2.25e-05, - "loss": 1.5836, + "loss": 0.3627, "step": 3400 }, { "epoch": 16.74757281553398, - "grad_norm": 11.000115394592285, + "grad_norm": 9.842249870300293, "learning_rate": 2.209546925566343e-05, - "loss": 1.6398, + "loss": 0.3969, "step": 3450 }, { "epoch": 16.990291262135923, - "grad_norm": 11.453404426574707, + "grad_norm": 10.705739974975586, "learning_rate": 2.169093851132686e-05, - "loss": 1.6466, + "loss": 0.4028, "step": 3500 }, { "epoch": 17.233009708737864, - "grad_norm": 11.012279510498047, + "grad_norm": 7.950681686401367, "learning_rate": 2.1286407766990295e-05, - "loss": 1.4434, + "loss": 0.284, "step": 3550 }, { "epoch": 17.475728155339805, - "grad_norm": 11.989383697509766, + "grad_norm": 8.526535034179688, "learning_rate": 2.0881877022653725e-05, - "loss": 1.4708, + "loss": 0.2934, "step": 3600 }, { "epoch": 17.718446601941746, - "grad_norm": 12.47008991241455, + "grad_norm": 8.999290466308594, "learning_rate": 2.047734627831715e-05, - "loss": 1.4783, + "loss": 0.2964, "step": 3650 }, { "epoch": 17.96116504854369, - "grad_norm": 10.983756065368652, + "grad_norm": 9.228137969970703, "learning_rate": 2.007281553398058e-05, - "loss": 1.4839, + "loss": 0.311, "step": 3700 }, { "epoch": 18.203883495145632, - "grad_norm": 9.582510948181152, + "grad_norm": 6.715802192687988, "learning_rate": 1.9668284789644014e-05, - "loss": 1.3226, + "loss": 0.2263, "step": 3750 }, { "epoch": 18.446601941747574, - "grad_norm": 11.663228034973145, + "grad_norm": 7.834264278411865, "learning_rate": 1.9263754045307444e-05, - "loss": 1.3522, + "loss": 0.2293, "step": 3800 }, { "epoch": 18.689320388349515, - "grad_norm": 11.278139114379883, + "grad_norm": 6.83228874206543, "learning_rate": 1.8859223300970874e-05, - "loss": 1.3753, + "loss": 0.2392, "step": 3850 }, { "epoch": 18.932038834951456, - "grad_norm": 10.586488723754883, + "grad_norm": 8.507209777832031, "learning_rate": 1.8454692556634304e-05, - "loss": 1.3774, + "loss": 0.2377, "step": 3900 }, { "epoch": 19.174757281553397, - "grad_norm": 11.243274688720703, + "grad_norm": 6.565855026245117, "learning_rate": 1.8050161812297738e-05, - "loss": 1.2506, + "loss": 0.1905, "step": 3950 }, { "epoch": 19.41747572815534, - "grad_norm": 10.601967811584473, + "grad_norm": 7.470388889312744, "learning_rate": 1.7645631067961167e-05, - "loss": 1.2276, + "loss": 0.1846, "step": 4000 }, { "epoch": 19.660194174757283, - "grad_norm": 10.919422149658203, + "grad_norm": 7.033578395843506, "learning_rate": 1.7241100323624594e-05, - "loss": 1.2518, + "loss": 0.1911, "step": 4050 }, { "epoch": 19.902912621359224, - "grad_norm": 11.303253173828125, + "grad_norm": 8.110376358032227, "learning_rate": 1.6836569579288027e-05, - "loss": 1.279, + "loss": 0.1954, "step": 4100 }, { "epoch": 20.145631067961165, - "grad_norm": 10.864278793334961, + "grad_norm": 7.335451602935791, "learning_rate": 1.6432038834951457e-05, - "loss": 1.1686, + "loss": 0.1619, "step": 4150 }, { "epoch": 20.388349514563107, - "grad_norm": 11.179913520812988, + "grad_norm": 7.478648662567139, "learning_rate": 1.6027508090614887e-05, - "loss": 1.1252, + "loss": 0.1497, "step": 4200 }, { "epoch": 20.631067961165048, - "grad_norm": 10.640514373779297, + "grad_norm": 7.764925479888916, "learning_rate": 1.5622977346278317e-05, - "loss": 1.1581, + "loss": 0.1559, "step": 4250 }, { "epoch": 20.87378640776699, - "grad_norm": 11.186044692993164, + "grad_norm": 6.432894706726074, "learning_rate": 1.5218446601941749e-05, - "loss": 1.1782, + "loss": 0.1601, "step": 4300 }, { "epoch": 21.116504854368934, - "grad_norm": 11.105583190917969, + "grad_norm": 7.728542327880859, "learning_rate": 1.4813915857605179e-05, - "loss": 1.0972, + "loss": 0.1416, "step": 4350 }, { "epoch": 21.359223300970875, - "grad_norm": 10.809669494628906, + "grad_norm": 6.370733737945557, "learning_rate": 1.4409385113268609e-05, - "loss": 1.0487, + "loss": 0.1211, "step": 4400 }, { "epoch": 21.601941747572816, - "grad_norm": 11.11020278930664, + "grad_norm": 5.939419746398926, "learning_rate": 1.4004854368932039e-05, - "loss": 1.0566, + "loss": 0.1278, "step": 4450 }, { "epoch": 21.844660194174757, - "grad_norm": 11.062104225158691, + "grad_norm": 6.0013957023620605, "learning_rate": 1.360032362459547e-05, - "loss": 1.078, + "loss": 0.1274, "step": 4500 }, { "epoch": 22.0873786407767, - "grad_norm": 10.393362998962402, + "grad_norm": 6.1365966796875, "learning_rate": 1.31957928802589e-05, - "loss": 1.0351, + "loss": 0.1205, "step": 4550 }, { "epoch": 22.33009708737864, - "grad_norm": 9.539032936096191, + "grad_norm": 5.679115295410156, "learning_rate": 1.279126213592233e-05, - "loss": 0.9655, + "loss": 0.106, "step": 4600 }, { "epoch": 22.57281553398058, - "grad_norm": 10.375469207763672, + "grad_norm": 4.750304698944092, "learning_rate": 1.2386731391585762e-05, - "loss": 0.9857, + "loss": 0.108, "step": 4650 }, { "epoch": 22.815533980582526, - "grad_norm": 11.338729858398438, + "grad_norm": 6.115650653839111, "learning_rate": 1.1982200647249192e-05, - "loss": 0.9975, + "loss": 0.1086, "step": 4700 }, { "epoch": 23.058252427184467, - "grad_norm": 10.976126670837402, + "grad_norm": 5.252938270568848, "learning_rate": 1.1577669902912622e-05, - "loss": 0.9785, + "loss": 0.1041, "step": 4750 }, { "epoch": 23.300970873786408, - "grad_norm": 11.16224193572998, + "grad_norm": 5.175116062164307, "learning_rate": 1.1173139158576053e-05, - "loss": 0.9011, + "loss": 0.0872, "step": 4800 }, { "epoch": 23.54368932038835, - "grad_norm": 11.640942573547363, + "grad_norm": 5.768143653869629, "learning_rate": 1.0768608414239483e-05, - "loss": 0.9056, + "loss": 0.0945, "step": 4850 }, { "epoch": 23.78640776699029, - "grad_norm": 10.686304092407227, + "grad_norm": 5.235666275024414, "learning_rate": 1.0364077669902913e-05, - "loss": 0.9267, + "loss": 0.0969, "step": 4900 }, { "epoch": 24.02912621359223, - "grad_norm": 9.798011779785156, + "grad_norm": 4.407253265380859, "learning_rate": 9.959546925566343e-06, - "loss": 0.9134, + "loss": 0.0933, "step": 4950 }, { "epoch": 24.271844660194176, - "grad_norm": 10.638201713562012, + "grad_norm": 5.28978157043457, "learning_rate": 9.555016181229775e-06, - "loss": 0.8257, + "loss": 0.0766, "step": 5000 }, { "epoch": 24.514563106796118, - "grad_norm": 10.393739700317383, + "grad_norm": 6.059443950653076, "learning_rate": 9.150485436893205e-06, - "loss": 0.8574, + "loss": 0.082, "step": 5050 }, { "epoch": 24.75728155339806, - "grad_norm": 11.262478828430176, + "grad_norm": 4.627708435058594, "learning_rate": 8.745954692556635e-06, - "loss": 0.8645, + "loss": 0.0785, "step": 5100 }, { "epoch": 25.0, - "grad_norm": 12.033185005187988, + "grad_norm": 5.263092041015625, "learning_rate": 8.341423948220065e-06, - "loss": 0.8719, + "loss": 0.0816, "step": 5150 }, { "epoch": 25.24271844660194, - "grad_norm": 10.399738311767578, + "grad_norm": 6.337419509887695, "learning_rate": 7.936893203883496e-06, - "loss": 0.7727, + "loss": 0.0676, "step": 5200 }, { "epoch": 25.485436893203882, - "grad_norm": 9.93704605102539, + "grad_norm": 3.846252202987671, "learning_rate": 7.532362459546925e-06, - "loss": 0.799, + "loss": 0.0699, "step": 5250 }, { "epoch": 25.728155339805824, - "grad_norm": 9.828903198242188, + "grad_norm": 5.770230293273926, "learning_rate": 7.127831715210356e-06, - "loss": 0.8073, + "loss": 0.0673, "step": 5300 }, { "epoch": 25.97087378640777, - "grad_norm": 11.109928131103516, + "grad_norm": 4.10913610458374, "learning_rate": 6.723300970873788e-06, - "loss": 0.8155, + "loss": 0.0706, "step": 5350 }, { "epoch": 26.21359223300971, - "grad_norm": 10.467114448547363, + "grad_norm": 4.198057174682617, "learning_rate": 6.318770226537217e-06, - "loss": 0.7476, + "loss": 0.0615, "step": 5400 }, { "epoch": 26.45631067961165, - "grad_norm": 10.594189643859863, + "grad_norm": 5.369973182678223, "learning_rate": 5.914239482200648e-06, - "loss": 0.7518, + "loss": 0.0585, "step": 5450 }, { "epoch": 26.699029126213592, - "grad_norm": 9.673776626586914, + "grad_norm": 5.3213324546813965, "learning_rate": 5.5097087378640776e-06, - "loss": 0.7584, + "loss": 0.0594, "step": 5500 }, { "epoch": 26.941747572815533, - "grad_norm": 10.382043838500977, + "grad_norm": 3.871309518814087, "learning_rate": 5.105177993527508e-06, - "loss": 0.7637, + "loss": 0.058, "step": 5550 }, { "epoch": 27.184466019417474, - "grad_norm": 9.92479133605957, + "grad_norm": 5.000518798828125, "learning_rate": 4.700647249190938e-06, - "loss": 0.722, + "loss": 0.0562, "step": 5600 }, { "epoch": 27.42718446601942, - "grad_norm": 9.948137283325195, + "grad_norm": 5.81850004196167, "learning_rate": 4.296116504854369e-06, - "loss": 0.7195, + "loss": 0.0523, "step": 5650 }, { "epoch": 27.66990291262136, - "grad_norm": 10.299307823181152, + "grad_norm": 4.536961555480957, "learning_rate": 3.8915857605178e-06, - "loss": 0.7244, + "loss": 0.0552, "step": 5700 }, { "epoch": 27.9126213592233, - "grad_norm": 10.693897247314453, + "grad_norm": 3.8314859867095947, "learning_rate": 3.4870550161812302e-06, - "loss": 0.7299, + "loss": 0.0521, "step": 5750 }, { "epoch": 28.155339805825243, - "grad_norm": 9.83080768585205, + "grad_norm": 5.764403343200684, "learning_rate": 3.0825242718446606e-06, - "loss": 0.7041, + "loss": 0.0476, "step": 5800 }, { "epoch": 28.398058252427184, - "grad_norm": 10.031473159790039, + "grad_norm": 2.81915020942688, "learning_rate": 2.677993527508091e-06, - "loss": 0.6903, + "loss": 0.0484, "step": 5850 }, { "epoch": 28.640776699029125, - "grad_norm": 9.686220169067383, + "grad_norm": 5.184347152709961, "learning_rate": 2.2734627831715213e-06, - "loss": 0.6882, + "loss": 0.0459, "step": 5900 }, { "epoch": 28.883495145631066, - "grad_norm": 9.987640380859375, + "grad_norm": 2.9731695652008057, "learning_rate": 1.8689320388349515e-06, - "loss": 0.6918, + "loss": 0.0471, "step": 5950 }, { "epoch": 29.12621359223301, - "grad_norm": 9.621675491333008, + "grad_norm": 6.170887470245361, "learning_rate": 1.4644012944983818e-06, - "loss": 0.6878, + "loss": 0.045, "step": 6000 }, { "epoch": 29.368932038834952, - "grad_norm": 9.286832809448242, + "grad_norm": 2.6630172729492188, "learning_rate": 1.0598705501618124e-06, - "loss": 0.6723, + "loss": 0.0428, "step": 6050 }, { "epoch": 29.611650485436893, - "grad_norm": 8.733819961547852, + "grad_norm": 2.1566600799560547, "learning_rate": 6.553398058252428e-07, - "loss": 0.6721, + "loss": 0.0435, "step": 6100 }, { "epoch": 29.854368932038835, - "grad_norm": 10.382695198059082, + "grad_norm": 5.664730072021484, "learning_rate": 2.5080906148867315e-07, - "loss": 0.6753, + "loss": 0.0445, "step": 6150 } ], @@ -888,7 +888,7 @@ "attributes": {} } }, - "total_flos": 807406959329280.0, + "total_flos": 2869704586690560.0, "train_batch_size": 4, "trial_name": null, "trial_params": null diff --git a/checkpoints/checkpoint-6180/training_args.bin b/checkpoints/checkpoint-6180/training_args.bin index 0403c28e41ead56dc065860ac748f500b2df575a..6dcc5a1a1def25e4256c25ec89c109edb25731fb 100644 --- a/checkpoints/checkpoint-6180/training_args.bin +++ b/checkpoints/checkpoint-6180/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7935e3b7629485cf26f624afe618b58a390d4c3c9c6dcf878b3ba9099ef1027b +oid sha256:d5c04ce53d557cd762ff77607aaadbd1248aaa177697662d9a21b90d444993b7 size 5304 diff --git a/config.json b/config.json index fdd810faba252b1c3f085663999fe9c24e8e44d6..990b1515737ce9bfc8638b4941fcfacdce1334d3 100644 --- a/config.json +++ b/config.json @@ -10,12 +10,12 @@ "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", - "n_ctx": 1024, - "n_embd": 768, + "n_ctx": 2048, + "n_embd": 1024, "n_head": 2, "n_inner": null, - "n_layer": 6, - "n_positions": 1024, + "n_layer": 12, + "n_positions": 2048, "pad_token_id": 1, "reorder_and_upcast_attn": false, "resid_pdrop": 0.1, diff --git a/model.safetensors b/model.safetensors index b789936bfb9c0ef9e42c2c304f40eb4b62d54c64..3e79c22b7e5b8885bb0e24ffa22944eaec0c8d2e 100644 --- a/model.safetensors +++ b/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:964574ee2496392ed8546d35dde6ee1c393d9db5c163a62154e223c6dc7aa7c8 -size 176343496 +oid sha256:20d550b61dd9790a42142014e712c6d94de2be31139ec39dcbbf926a20d95619 +size 617130824 diff --git a/train.py b/train.py index 4e5426c4b4ef11a4af06f2e6ad7645ac8bceb70d..9a03ddcf1e6029db68346a10b2aaf493e961a21b 100644 --- a/train.py +++ b/train.py @@ -46,10 +46,10 @@ train_dataset = load_dataset("train_data.txt", hf_tokenizer) # Step 3: Define GPT2 config for a tiny model config = GPT2Config( vocab_size=hf_tokenizer.vocab_size, - n_positions=1024, - n_ctx=1024, - n_embd=768, - n_layer=6, + n_positions=2048, + n_ctx=2048, + n_embd=1024, + n_layer=12, n_head=2, bos_token_id=hf_tokenizer.bos_token_id, eos_token_id=hf_tokenizer.eos_token_id, diff --git a/training_args.bin b/training_args.bin index 0403c28e41ead56dc065860ac748f500b2df575a..6dcc5a1a1def25e4256c25ec89c109edb25731fb 100644 --- a/training_args.bin +++ b/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7935e3b7629485cf26f624afe618b58a390d4c3c9c6dcf878b3ba9099ef1027b +oid sha256:d5c04ce53d557cd762ff77607aaadbd1248aaa177697662d9a21b90d444993b7 size 5304