# NeMo/examples/nlp/language_modeling/conf/megatron_retro_finetune_config.yaml
name: fine_tune_retro
trainer:
  devices: 2
  num_nodes: 1
  accelerator: gpu
  precision: 16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch.
  max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  log_every_n_steps: 10
  val_check_interval: 100
  limit_val_batches: null
  limit_test_batches: null
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
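
# Worked example of the max_steps comment above (illustration only, not a config key):
# with devices=2, num_nodes=1 and no model parallelism, data_parallel_size = 2, so one
# global step consumes micro_batch_size * data_parallel_size * accumulate_grad_batches
# = 4 * 2 * 1 = 8 samples; 100000 steps therefore consume 800000 samples.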
exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: megatron_retro
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: False # saves a .nemo file during validation; not implemented for model-parallel models
    filename: 'megatron_retro--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
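
# Note: model_parallel_size above is computed by the ${multiply:...} OmegaConf resolver
# that NeMo registers at startup; with the defaults below (tensor=1, pipeline=1) it
# resolves to 1, so checkpoints are written without model-parallel sharding.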
model:
  # model parallelism
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1 # must be 1; pipeline parallelism is not supported yet
  micro_batch_size: 4
  megatron_amp_O2: False # use O2-style mixed precision (parameters cast to half once) instead of native AMP's on-the-fly autocasting
  tokenizer:
    library: 'megatron'
    type: 'GPT2BPETokenizer'
    model: null
    vocab_file: null
    merge_file: null
    delimiter: null # only used for the tabular tokenizer
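
  # Example (hypothetical paths): to use the standard GPT-2 BPE files, override the two
  # keys above, e.g. on the Hydra command line:
  #   model.tokenizer.vocab_file=/path/to/gpt2-vocab.json \
  #   model.tokenizer.merge_file=/path/to/gpt2-merges.txt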
  gradient_as_bucket_view: True # allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
  # precision
  native_amp_init_scale: 4294967296 # 2 ** 32
  native_amp_growth_interval: 1000
  fp16_lm_cross_entropy: False # compute the unreduced LM-head cross-entropy loss in FP16
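
  # Loss-scaling sketch (standard GradScaler behavior): the scale starts at 2**32 and,
  # after every native_amp_growth_interval (1000) consecutive overflow-free steps, is
  # doubled; on overflow the optimizer step is skipped and the scale is halved.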
  # miscellaneous
  seed: 1234
  restore_path: null # path to the pretrained RETRO model to restore and fine-tune
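
  # Example launch (a sketch; the script path is assumed from the NeMo examples layout,
  # and all file paths are placeholders):
  #   python examples/nlp/language_modeling/megatron_retro_fine_tune.py \
  #     model.restore_path=/path/to/pretrained_retro.nemo \
  #     model.data.train_ds.file_name=/path/to/train.jsonl \
  #     model.data.val_ds.file_name=/path/to/val.jsonl \
  #     model.data.test_ds.file_name=/path/to/test.jsonl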
  data:
    train_ds:
      file_name: ??? # train data file path
      answer_only_loss: True # whether to compute the loss on the answer tokens only
      seq_length: 128 # must be a multiple of the chunk_size in your dataset
      add_bos: True # whether to add a BOS token at the beginning
      add_eos: True # whether to add an EOS token at the end
      seed: 1234
      neighbors: 20 # number of retrieved neighbors
    val_ds:
      file_name: ??? # validation data file path
      answer_only_loss: True # whether to compute the loss on the answer tokens only
      seq_length: 128 # must be a multiple of the chunk_size in your dataset
      add_bos: True # whether to add a BOS token at the beginning
      add_eos: True # whether to add an EOS token at the end
      seed: 1234
      neighbors: 20 # number of retrieved neighbors
    test_ds:
      file_name: ??? # test data file path
      answer_only_loss: True # whether to compute the loss on the answer tokens only
      seq_length: 128 # must be a multiple of the chunk_size in your dataset
      add_bos: True # whether to add a BOS token at the beginning
      add_eos: True # whether to add an EOS token at the end
      seed: 1234
      neighbors: 20 # number of retrieved neighbors
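
  # Sizing note: RETRO datasets are chunked, so seq_length must be a multiple of the
  # dataset's chunk_size (commonly 64 for RETRO); 128 = 2 * 64 satisfies this. Each
  # sample also carries `neighbors` retrieved chunks, so activation memory grows with
  # both settings.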
  optim:
    name: fused_adam
    lr: 1e-4
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 500
      constant_steps: 50000
      min_lr: 1e-5
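
  # Schedule sketch: the learning rate warms up linearly to 1e-4 over the first 500 steps,
  # then follows a cosine decay toward min_lr=1e-5; with NeMo's CosineAnnealing, the last
  # constant_steps=50000 steps are held constant at min_lr.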