|
---
# Hydra defaults list: compose the shared Megatron base model config into
# this file's `model` node (relative-package syntax `.@model`).
defaults:
  - .@model: megatron_model_base_config

# Run name for this experiment.
name: test_retro
# Optional path to a pretrained .nemo checkpoint to restore from; null = train from scratch.
restore_from_path: null
|
|
|
# PyTorch Lightning Trainer arguments.
trainer:
  devices: 2
  num_nodes: 1
  accelerator: gpu
  precision: 16  # fp16 mixed precision
  logger: false  # disabled here; logging is handled by exp_manager below
  enable_checkpointing: false  # disabled here; exp_manager creates the checkpoint callback
  replace_sampler_ddp: false
  max_epochs: -1  # unlimited epochs; training length is governed by max_steps
  max_steps: 100000
  log_every_n_steps: 10
  val_check_interval: 100
  limit_val_batches: null
  limit_test_batches: null
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
|
|
|
# NeMo experiment manager: log directories, W&B, resume, and checkpointing.
exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: megatron_retro
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: true  # resume automatically when a checkpoint is found
  resume_ignore_no_checkpoint: true  # do not fail if no checkpoint exists yet
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: false
    filename: 'megatron_retro--{val_loss:.2f}-{step}-{consumed_samples}'
  # Total model-parallel world size = TP * PP (OmegaConf resolver).
  model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
|
|
|
|
|
model:
  version: 1  # RETRO model version selector

  # Batch size and model-parallel layout.
  micro_batch_size: 4
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1

  # Sequence length; position embeddings sized to match via interpolation.
  encoder_seq_length: 2048
  max_position_embeddings: ${.encoder_seq_length}

  gradient_as_bucket_view: true  # DDP: gradients view into allreduce buckets

  dump_debug_info: false
  dump_debug_info_to_file: false

  # RETRO-specific architecture settings.
  chunk_size: 64  # retrieval chunk size in tokens — NOTE(review): must match the retrieval index build; confirm
  enc_num_layers: 4
  dec_num_layers: 6
  enc_cross_attention: [3]  # encoder layer indices with cross-attention
  dec_cross_attention: [3, 5]  # decoder layer indices with cross-attention
  add_position_embedding: false

  make_vocab_size_divisible_by: 128  # pad vocab for tensor-parallel efficiency
  pre_process: true
  post_process: true
  bert_binary_head: true

  megatron_amp_O2: false
  grad_allreduce_chunk_size_mb: 125

  megatron_lm_compatible: false

  tokenizer:
    library: 'megatron'
    type: 'GPT2BPETokenizer'
    model: null
    vocab_file: null
    merge_file: null
    delimiter: null

  # Native (non-O2) mixed-precision loss-scaling settings.
  native_amp_init_scale: 4294967296  # 2**32
  native_amp_growth_interval: 1000
  fp16_lm_cross_entropy: false

  seed: 1234

  data:
    # ??? marks Hydra/OmegaConf mandatory values that must be supplied at launch.
    data_prefix: ???
    knn_index: ???
    retrieval_prefix: ???
    index_mapping_dir: null
    data_impl: retmmap
    splits_string: 900,50,50  # presumably train/validation/test proportions — verify against dataset code
    seq_length: ${model.encoder_seq_length}
    skip_warmup: true
    num_workers: 0
    dataloader_type: single
    neighbors: 2  # number of retrieved neighbor chunks per query chunk

  optim:
    name: fused_adam
    lr: 1e-4
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 500
      constant_steps: 50000
      min_lr: 1e-5
|
|