pretrain core 4
Browse files
scripts/pretrain_core_model_4.yaml
CHANGED
@@ -124,15 +124,31 @@ eval:
|
|
124 |
# - 0.9
|
125 |
# - 0.999
|
126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
optimizer:
|
128 |
-
class_path: sophia_opt.SophiaG
|
|
|
|
|
|
|
|
|
129 |
init_args:
|
|
|
130 |
lr: 1e-4
|
|
|
|
|
|
|
131 |
betas:
|
132 |
- 0.9
|
133 |
-
- 0.95
|
134 |
-
rho: 0.05
|
135 |
-
weight_decay: 0.1
|
136 |
|
137 |
# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
|
138 |
devices: auto
|
|
|
124 |
# - 0.9
|
125 |
# - 0.999
|
126 |
|
127 |
+
# optimizer:
|
128 |
+
# class_path: sophia_opt.SophiaG
|
129 |
+
# init_args:
|
130 |
+
# lr: 1e-4
|
131 |
+
# betas:
|
132 |
+
# - 0.9
|
133 |
+
# - 0.95
|
134 |
+
# rho: 0.05
|
135 |
+
# weight_decay: 0.1
|
136 |
+
|
137 |
optimizer:
|
138 |
+
# class_path: torch.optim.AdamW
|
139 |
+
class_path: torchao.prototype.low_bit_optim.AdamW8bit
|
140 |
+
# class_path: torchao.prototype.low_bit_optim.AdamW4bit
|
141 |
+
# class_path: bitsandbytes.optim.AdamW8bit
|
142 |
+
# class_path: bitsandbytes.optim.PagedAdamW8bit
|
143 |
init_args:
|
144 |
+
# (type: float, default: 0.001)
|
145 |
lr: 1e-4
|
146 |
+
# (type: float, default: 0.01)
|
147 |
+
weight_decay: 0.01
|
148 |
+
# (type: tuple, default: (0.9,0.999))
|
149 |
betas:
|
150 |
- 0.9
|
151 |
+
- 0.999
|
|
|
|
|
152 |
|
153 |
# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
|
154 |
devices: auto
|