# NeMo/examples/nlp/language_modeling/conf/megatron_retro_finetune_config.yaml
name: fine_tune_retro
trainer:
  devices: 2
  num_nodes: 1
  accelerator: gpu
  precision: 16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch.
  max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  log_every_n_steps: 10
  val_check_interval: 100
  limit_val_batches: null
  limit_test_batches: null
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
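
# Worked example of the max_steps comment above (illustration only, not a config key):
# with devices=2, num_nodes=1 and no model parallelism, data_parallel_size = 2, so one
# global step consumes micro_batch_size * data_parallel_size * accumulate_grad_batches
# = 4 * 2 * 1 = 8 samples; 100000 steps therefore consume 800000 samples.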
exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: megatron_retro
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: False # saves a .nemo file during validation; not implemented for model-parallel models
    filename: 'megatron_retro--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
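
# Note: model_parallel_size above is computed by the ${multiply:...} OmegaConf resolver
# that NeMo registers at startup; with the defaults below (tensor=1, pipeline=1) it
# resolves to 1, so checkpoints are written without model-parallel sharding.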
model:
  # model parallelism
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1 # must be 1; pipeline parallelism is not supported yet
  micro_batch_size: 4
  megatron_amp_O2: False # use O2-style mixed precision (parameters cast to half once) instead of native AMP's on-the-fly autocasting
  tokenizer:
    library: 'megatron'
    type: 'GPT2BPETokenizer'
    model: null
    vocab_file: null
    merge_file: null
    delimiter: null # only used for the tabular tokenizer
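
  # Example (hypothetical paths): to use the standard GPT-2 BPE files, override the two
  # keys above, e.g. on the Hydra command line:
  #   model.tokenizer.vocab_file=/path/to/gpt2-vocab.json \
  #   model.tokenizer.merge_file=/path/to/gpt2-merges.txt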
  gradient_as_bucket_view: True # allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
  # precision
  native_amp_init_scale: 4294967296 # 2 ** 32
  native_amp_growth_interval: 1000
  fp16_lm_cross_entropy: False # compute the unreduced LM-head cross-entropy loss in FP16
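
  # Loss-scaling sketch (standard GradScaler behavior): the scale starts at 2**32 and,
  # after every native_amp_growth_interval (1000) consecutive overflow-free steps, is
  # doubled; on overflow the optimizer step is skipped and the scale is halved.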
  # miscellaneous
  seed: 1234
  restore_path: null # path to the pretrained RETRO model to restore and fine-tune
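
  # Example launch (a sketch; the script path is assumed from the NeMo examples layout,
  # and all file paths are placeholders):
  #   python examples/nlp/language_modeling/megatron_retro_fine_tune.py \
  #     model.restore_path=/path/to/pretrained_retro.nemo \
  #     model.data.train_ds.file_name=/path/to/train.jsonl \
  #     model.data.val_ds.file_name=/path/to/val.jsonl \
  #     model.data.test_ds.file_name=/path/to/test.jsonl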
  data:
    train_ds:
      file_name: ??? # train data file path
      answer_only_loss: True # whether to compute the loss on the answer tokens only
      seq_length: 128 # must be a multiple of the chunk_size in your dataset
      add_bos: True # whether to add a BOS token at the beginning
      add_eos: True # whether to add an EOS token at the end
      seed: 1234
      neighbors: 20 # number of retrieved neighbors
    val_ds:
      file_name: ??? # validation data file path
      answer_only_loss: True # whether to compute the loss on the answer tokens only
      seq_length: 128 # must be a multiple of the chunk_size in your dataset
      add_bos: True # whether to add a BOS token at the beginning
      add_eos: True # whether to add an EOS token at the end
      seed: 1234
      neighbors: 20 # number of retrieved neighbors
    test_ds:
      file_name: ??? # test data file path
      answer_only_loss: True # whether to compute the loss on the answer tokens only
      seq_length: 128 # must be a multiple of the chunk_size in your dataset
      add_bos: True # whether to add a BOS token at the beginning
      add_eos: True # whether to add an EOS token at the end
      seed: 1234
      neighbors: 20 # number of retrieved neighbors
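
  # Sizing note: RETRO datasets are chunked, so seq_length must be a multiple of the
  # dataset's chunk_size (commonly 64 for RETRO); 128 = 2 * 64 satisfies this. Each
  # sample also carries `neighbors` retrieved chunks, so activation memory grows with
  # both settings.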
  optim:
    name: fused_adam
    lr: 1e-4
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 500
      constant_steps: 50000
      min_lr: 1e-5
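
  # Schedule sketch: the learning rate warms up linearly to 1e-4 over the first 500 steps,
  # then follows a cosine decay toward min_lr=1e-5; with NeMo's CosineAnnealing, the last
  # constant_steps=50000 steps are held constant at min_lr.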