pretrain core 4
Browse files
scripts/pretrain_core_model_4.yaml
CHANGED
@@ -60,7 +60,7 @@ train:
|
|
60 |
# Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
|
61 |
# global_batch_size: 512
|
62 |
# global_batch_size: 256
|
63 |
-
global_batch_size:
|
64 |
|
65 |
# Number of samples per data-parallel rank (type: int, default: 4)
|
66 |
micro_batch_size: 1
|
@@ -124,31 +124,15 @@ eval:
|
|
124 |
# - 0.9
|
125 |
# - 0.999
|
126 |
|
127 |
-
# optimizer:
|
128 |
-
# class_path: sophia_opt.SophiaG
|
129 |
-
# init_args:
|
130 |
-
# lr: 1e-4
|
131 |
-
# betas:
|
132 |
-
# - 0.9
|
133 |
-
# - 0.95
|
134 |
-
# rho: 0.05
|
135 |
-
# weight_decay: 0.1
|
136 |
-
|
137 |
optimizer:
|
138 |
-
|
139 |
-
# class_path: torchao.prototype.low_bit_optim.AdamW8bit
|
140 |
-
# class_path: torchao.prototype.low_bit_optim.AdamW4bit
|
141 |
-
# class_path: bitsandbytes.optim.AdamW8bit
|
142 |
-
class_path: bitsandbytes.optim.PagedAdamW8bit
|
143 |
init_args:
|
144 |
-
# (type: float, default: 0.001)
|
145 |
lr: 1e-4
|
146 |
-
# (type: float, default: 0.01)
|
147 |
-
weight_decay: 0.01
|
148 |
-
# (type: tuple, default: (0.9,0.999))
|
149 |
betas:
|
150 |
- 0.9
|
151 |
-
- 0.
|
|
|
|
|
152 |
|
153 |
# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
|
154 |
devices: auto
|
|
|
60 |
# Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
|
61 |
# global_batch_size: 512
|
62 |
# global_batch_size: 256
|
63 |
+
global_batch_size: 32
|
64 |
|
65 |
# Number of samples per data-parallel rank (type: int, default: 4)
|
66 |
micro_batch_size: 1
|
|
|
124 |
# - 0.9
|
125 |
# - 0.999
|
126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
optimizer:
|
128 |
+
class_path: sophia_opt.SophiaG
|
|
|
|
|
|
|
|
|
129 |
init_args:
|
|
|
130 |
lr: 1e-4
|
|
|
|
|
|
|
131 |
betas:
|
132 |
- 0.9
|
133 |
+
- 0.95
|
134 |
+
rho: 0.05
|
135 |
+
weight_decay: 0.1
|
136 |
|
137 |
# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
|
138 |
devices: auto
|