# NeMo/examples/nlp/language_modeling/conf/megatron_t5_lm_adaptation_finetune.yaml
name: megatron_t5_lm_adaptation_finetune
restore_from_path: null # used when starting from a .nemo file
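# Illustrative launch command (a sketch: the script name follows the usual NeMo examples layout,
# and the GPU count and paths below are placeholders, not values required by this config):
#   python megatron_t5_lm_adaptation_finetune.py \
#     --config-path=conf --config-name=megatron_t5_lm_adaptation_finetune \
#     trainer.devices=2 \
#     model.pretrained_model_path=/path/to/pretrained_t5.nemo \
#     model.data.data_prefix=[1.0,/path/to/my-t5_00_text_document]
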
trainer:
  devices: 1
  num_nodes: 1
  accelerator: gpu
  precision: 16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: -1 # PTL default. In practice, max_steps will be reached first.
  max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  log_every_n_steps: 10
  val_check_interval: 100
  limit_val_batches: 50
  limit_test_batches: 500
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
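# Worked example for the consumed_samples formula above (illustrative GPU count, not a requirement):
# with 4 GPUs, tensor_model_parallel_size=2 and pipeline_model_parallel_size=1,
# data_parallel_size = 4 / (2 * 1) = 2, so each step consumes
# micro_batch_size * data_parallel_size * accumulate_grad_batches = 4 * 2 * 1 = 8 samples
# (matching global_batch_size below), and max_steps=100000 corresponds to 800,000 consumed samples.
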
exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: megatron_t5_lm_adaptation_finetune
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
    filename: 'megatron_t5--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
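# With the values below (tensor_model_parallel_size=2, pipeline_model_parallel_size=1),
# the ${multiply:...} resolver above yields model_parallel_size = 2 * 1 = 2.
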
model:
  # pretrained model path
  pretrained_model_path: ???

  # model parallelism
  micro_batch_size: 4
  global_batch_size: 8 # will use more micro batches to reach global batch size
  tensor_model_parallel_size: 2
  pipeline_model_parallel_size: 1
  resume_from_checkpoint: null # manually set the checkpoint file to load from
  pipeline_model_parallel_split_rank: 1
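  # (pipeline_model_parallel_split_rank sets the pipeline rank at which the T5 encoder ends and the
  # decoder begins; it only takes effect when pipeline_model_parallel_size > 1.)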
  # O2 mixed precision
  megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting.

  # JIT fusion params.
  bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
  masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
  bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
  gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

  # Dropout
  hidden_dropout: null
  attention_dropout: null
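  # (Leaving these null is assumed to keep the dropout values stored in the pretrained checkpoint;
  # set explicit floats here to override them for fine-tuning.)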
  data:
    # Path to data must be specified by the user.
    # Can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-t5_00_text_document,.5,/raid/data/pile/my-t5_01_text_document]",
    # or see the example below:
    # data_prefix:
    #   - .5
    #   - /raid/data/pile/my-t5_00_text_document
    #   - .5
    #   - /raid/data/pile/my-t5_01_text_document
    data_prefix: ???
    index_mapping_dir: null
    data_impl: mmap
    splits_string: 949,45,5
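    # (splits_string gives relative train/validation/test weights: 949,45,5 ≈ 94.9% / 4.5% / 0.5% of the data.)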
    seq_length: ${model.seq_length}
    seq_length_dec: 128
    skip_warmup: True
    num_workers: 0
    dataloader_type: single # cyclic
    masked_lm_prob: 0.15
    dataset_type: 't5_prefix_lm'
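    # ('t5_prefix_lm' continues training with a prefix language-modeling objective instead of the
    # span-corruption objective used for T5 pretraining; this is what "LM adaptation" refers to here.)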
    short_seq_prob: 0.0
    max_ngram_size: 10
    mean_ngram_size: null
    geometric_dist: True
    permutation: False
    whole_word_masking: True
    favor_longer_ngrams: False
  optim:
    name: fused_adam
    lr: 5e-6
    betas:
      - 0.9
      - 0.999
    eps: 1e-8
    weight_decay: 0.01
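    # An optional learning-rate schedule could be added here (illustrative sketch with placeholder
    # values; NeMo's optimizer config accepts a nested `sched` section):
    # sched:
    #   name: WarmupAnnealing
    #   warmup_steps: 1000
    #   min_lr: 1.0e-7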