micro_batch_size: 4
- README.md +25 -0
- scripts/pretrain-core-model.yaml +2 -1
README.md
CHANGED
@@ -65,6 +65,31 @@ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable
 ```
 
 ```
+Seed set to 23
+Time to instantiate model: 0.23 seconds.
+Total parameters: 138,084,864
+Verifying settings ...
+Measured TFLOPs: 6972.54
+Epoch 1 | iter 256 step 1 | loss train: 10.530, val: n/a | iter time: 3230.47 ms (step) remaining time: 3 days, 5:34:13
+Epoch 1 | iter 512 step 2 | loss train: 10.520, val: n/a | iter time: 589.19 ms (step) remaining time: 3 days, 0:40:40
+Epoch 1 | iter 768 step 3 | loss train: 10.485, val: n/a | iter time: 591.81 ms (step) remaining time: 2 days, 23:01:54
+Epoch 1 | iter 1024 step 4 | loss train: 10.447, val: n/a | iter time: 589.35 ms (step) remaining time: 2 days, 22:11:32
+Epoch 1 | iter 1280 step 5 | loss train: 10.350, val: n/a | iter time: 589.38 ms (step) remaining time: 2 days, 21:40:13
+Epoch 1 | iter 1536 step 6 | loss train: 10.241, val: n/a | iter time: 593.75 ms (step) remaining time: 2 days, 21:18:19
+Epoch 1 | iter 1792 step 7 | loss train: 10.134, val: n/a | iter time: 592.92 ms (step) remaining time: 2 days, 21:01:58
+Epoch 1 | iter 2048 step 8 | loss train: 10.049, val: n/a | iter time: 590.74 ms (step) remaining time: 2 days, 20:49:12
+Epoch 1 | iter 2304 step 9 | loss train: 9.869, val: n/a | iter time: 594.27 ms (step) remaining time: 2 days, 20:39:10
+Epoch 1 | iter 2560 step 10 | loss train: 9.771, val: n/a | iter time: 590.04 ms (step) remaining time: 2 days, 20:30:14
+Epoch 1 | iter 2816 step 11 | loss train: 9.643, val: n/a | iter time: 588.32 ms (step) remaining time: 2 days, 20:22:22
+Epoch 1 | iter 3072 step 12 | loss train: 9.557, val: n/a | iter time: 588.95 ms (step) remaining time: 2 days, 20:15:26
+Epoch 1 | iter 3328 step 13 | loss train: 9.487, val: n/a | iter time: 589.32 ms (step) remaining time: 2 days, 20:09:05
+Epoch 1 | iter 3584 step 14 | loss train: 9.413, val: n/a | iter time: 588.95 ms (step) remaining time: 2 days, 20:03:24
+Epoch 1 | iter 3840 step 15 | loss train: 9.322, val: n/a | iter time: 591.62 ms (step) remaining time: 2 days, 19:58:18
+Epoch 1 | iter 4096 step 16 | loss train: 9.241, val: n/a | iter time: 593.65 ms (step) remaining time: 2 days, 19:53:30
+Epoch 1 | iter 4352 step 17 | loss train: 9.163, val: n/a | iter time: 593.89 ms (step) remaining time: 2 days, 19:49:00
+Epoch 1 | iter 4608 step 18 | loss train: 9.122, val: n/a | iter time: 590.63 ms (step) remaining time: 2 days, 19:44:42
+Epoch 1 | iter 4864 step 19 | loss train: 9.077, val: n/a | iter time: 590.87 ms (step) remaining time: 2 days, 19:40:47
+Epoch 1 | iter 5120 step 20 | loss train: 9.018, val: n/a | iter time: 588.44 ms (step) remaining time: 2 days, 19:36:59
 # ...
 ```
 
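The log's step counter advances once every 256 iterations (step 1 at iter 256, step 2 at iter 512), i.e. gradients are accumulated over many micro-batches per optimizer step. A back-of-the-envelope check of those numbers; this is a sketch, and the single-rank assumption and the one-micro-batch-per-iteration convention are inferred from the log, not stated in the diff:

```python
# Back-of-the-envelope check of the log above. Assumptions (not stated in
# the diff): one data-parallel rank, and one micro-batch per iteration.
micro_batch_size = 4          # value set by this commit
iters_per_step = 512 - 256    # step 2 at iter 512, step 1 at iter 256
world_size = 1                # assumed: the README pins CUDA_VISIBLE_DEVICES=0

# Samples consumed per optimizer step under these assumptions:
print(micro_batch_size * iters_per_step * world_size)  # 1024

# The printed ETA is consistent with the per-iteration time:
iter_time_s = 0.58844                                  # 588.44 ms at iter 5120
remaining_s = 2 * 86400 + 19 * 3600 + 36 * 60 + 59     # "2 days, 19:36:59"
print(round(remaining_s / iter_time_s))                # ~413,668 iterations left
```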
scripts/pretrain-core-model.yaml
CHANGED
@@ -67,7 +67,8 @@ train:
 # global_batch_size: 256
 
 # Number of samples per data-parallel rank (type: int, default: 4)
-micro_batch_size:
+micro_batch_size: 4
+# micro_batch_size: 2
 # micro_batch_size: 1
 
 # Number of iterations with learning rate warmup active (type: int, default: 2000)
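The commented-out alternatives kept in the config (`# micro_batch_size: 2`, `# micro_batch_size: 1`) read like the record of stepping the micro-batch size up to the largest value that fits in GPU memory. A hypothetical probe for that search, not part of this repo, assuming a GPT-style `model` that maps a batch of token ids to logits:

```python
import torch

def max_micro_batch_size(model, seq_len, vocab_size, candidates=(1, 2, 4, 8)):
    """Return the largest candidate micro-batch size whose forward/backward
    pass fits in GPU memory (a sketch, not this repo's tooling)."""
    best = None
    for mbs in candidates:
        try:
            tokens = torch.randint(0, vocab_size, (mbs, seq_len), device="cuda")
            logits = model(tokens)
            logits.float().sum().backward()  # dummy loss, just to size the grads
            best = mbs
        except torch.cuda.OutOfMemoryError:
            break
        finally:
            model.zero_grad(set_to_none=True)
            torch.cuda.empty_cache()
    return best
```

Probing with a dummy forward/backward rather than a full training step slightly underestimates peak memory (optimizer state is missing), so the result is a starting point, not a guarantee.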