name: megatron_t5_lm_adaptation_finetune

restore_from_path: null # used when restoring a saved .nemo model for fine-tuning
trainer:
  devices: 1
  num_nodes: 1
  accelerator: gpu
  precision: 16 # 16-bit (AMP) mixed precision
  logger: False # logger is provided by exp_manager
  enable_checkpointing: False # checkpointing is handled by exp_manager
  replace_sampler_ddp: False
  max_epochs: -1 # PTL default; training length is controlled by max_steps
  max_steps: 100000
  log_every_n_steps: 10
  val_check_interval: 100
  limit_val_batches: 50
  limit_test_batches: 500
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
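
# Bookkeeping notes (standard Megatron-style accounting, stated here for reference):
#   consumed_samples = global_step * global_batch_size
#   trainer.devices * trainer.num_nodes must be divisible by
#   model.tensor_model_parallel_size * model.pipeline_model_parallel_size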

exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: megatron_t5_lm_adaptation_finetune
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: False # saving a .nemo file during validation is not implemented for model-parallel models
    filename: 'megatron_t5--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
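
# Note: the `multiply` OmegaConf resolver above is evaluated at config load time;
# with the values below (2 * 1), model_parallel_size resolves to 2, which the
# checkpoint callback uses when handling model-parallel checkpoints.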

model:
  # Pretrained T5 checkpoint to adapt; must be supplied by the user.
  pretrained_model_path: ???

  micro_batch_size: 4
  global_batch_size: 8 # reached by running additional micro-batches per step
  tensor_model_parallel_size: 2
  pipeline_model_parallel_size: 1
  resume_from_checkpoint: null # manually set a checkpoint file to load from
  pipeline_model_parallel_split_rank: 1 # pipeline rank at which the decoder starts
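
  # Worked example of the batch accounting above, assuming a 2-GPU run (so
  # data_parallel_size = world_size / (tensor * pipeline) = 2 / 2 = 1):
  # each step consumes global_batch_size = 8 samples, i.e.
  # 8 / (micro_batch_size * data_parallel_size) = 2 micro-batches per step.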

  megatron_amp_O2: False # O2-style mixed precision (fp32 master params in the optimizer) instead of PTL's native AMP

  # JIT fusion kernels
  bias_activation_fusion: True # fuse weight-matrix bias addition with the following activation
  masked_softmax_fusion: True # fuse the attention softmax with its mask
  bias_dropout_add_fusion: True # fuse bias addition, dropout, and the residual add

  gradient_as_bucket_view: True # allocate gradients as views into DDP buckets to save memory

  # Dropout overrides; null keeps the values stored in the pretrained checkpoint
  hidden_dropout: null
  attention_dropout: null

  data:
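    # data_prefix can be a single preprocessed-dataset prefix or a weighted blend.
    # Hypothetical blend example (placeholder paths, shown for illustration only):
    # data_prefix:
    #   - .5
    #   - /raid/data/pile/my-t5_00_text_document
    #   - .5
    #   - /raid/data/pile/my-t5_01_text_document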
    data_prefix: ??? # must be supplied by the user
    index_mapping_dir: null # where to save index-mapping .npy files; defaults to the data_prefix location
    data_impl: mmap
    splits_string: 949,45,5 # weights are normalized: ~95% train / 4.5% validation / 0.5% test
    seq_length: ${model.seq_length}
    seq_length_dec: 128
    skip_warmup: True
    num_workers: 0
    dataloader_type: single # or 'cyclic'
    masked_lm_prob: 0.15
    dataset_type: 't5_prefix_lm' # prefix-LM objective used for LM adaptation
    short_seq_prob: 0.0
    max_ngram_size: 10
    mean_ngram_size: null
    geometric_dist: True
    permutation: False
    whole_word_masking: True
    favor_longer_ngrams: False

  optim:
    name: fused_adam
    lr: 5e-6
    betas:
      - 0.9
      - 0.999
    eps: 1e-8
    weight_decay: 0.01
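
# Example launch with Hydra-style overrides (script name assumed from the config
# name; the .nemo and dataset paths are placeholders):
#   python megatron_t5_lm_adaptation_finetune.py \
#     model.pretrained_model_path=/path/to/t5.nemo \
#     model.data.data_prefix=[1.0,/path/to/corpus_text_document] \
#     trainer.devices=2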