# NeMo/examples/nlp/language_modeling/conf/megatron_t0_config.yaml
name: megatron_t0

trainer:
  devices: 1
  num_nodes: 1
  accelerator: gpu
  precision: 16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: -1 # PTL default. In practice, max_steps will be reached first.
  max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
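  # Worked example (illustrative numbers, assuming 8 GPUs of pure data parallelism):
  # with data_parallel_size = 8, micro_batch_size = 16, and accumulate_grad_batches = 1,
  # each global step consumes 16 * 8 * 1 = 128 samples, i.e. one train_ds.global_batch_size.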
  log_every_n_steps: 10
  val_check_interval: 300
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0

exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: megatron_t0
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: validation_${model.data.validation_ds.metric.name}
    save_top_k: 10
    mode: max
    always_save_nemo: False # TODO: add support
    filename: 'megatron_t0--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}'
    model_parallel_size: ${model.tensor_model_parallel_size}
    save_best_model: True
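    # Illustrative resolved filename (hypothetical metric value and step counts): with the
    # default exact_string_match metric the monitor resolves to validation_exact_string_match,
    # so a saved checkpoint looks roughly like
    # 'megatron_t0--validation_exact_string_match=0.825-step=3000-consumed_samples=384000.ckpt'.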

model:
  restore_from_path: null # Path to a trained T5 .nemo file
  pretrained_checkpoint:
    checkpoint_dir: null # Path to a folder that contains a .ckpt file
    checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir.
    hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint.
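    # Illustrative values (hypothetical paths, not part of the original config):
    #   checkpoint_dir: /results/megatron_t5/checkpoints
    #   checkpoint_name: megatron_t5--val_loss=1.25-step=100000.ckpt
    #   hparams_file: /results/megatron_t5/hparams.yaml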
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  pipeline_model_parallel_split_rank: 0
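  # Illustrative multi-GPU layout (assumed, adjust to your cluster): with 8 GPUs,
  # tensor_model_parallel_size: 2 and pipeline_model_parallel_size: 2 leave a data-parallel
  # size of 8 / (2 * 2) = 2. For an encoder-decoder model, pipeline_model_parallel_split_rank
  # marks the pipeline stage where the decoder begins (e.g. 1 with two pipeline stages).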
  gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
  megatron_amp_O2: False # Enable O2 optimization for megatron amp
  resume_from_checkpoint: null
  hidden_dropout: 0.1 # Override dropout prob from pretraining
  attention_dropout: 0.1 # Override attention dropout prob from pretraining

  data:
    train_ds:
      file_names: ??? # Path to a list of JSONL files corresponding to the source data.
      global_batch_size: 128
      micro_batch_size: 16
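      # Sanity check (illustrative): global batches are built as
      # micro_batch_size * data_parallel_size * num_micro_batches_per_step, so with
      # micro_batch_size = 16 and 8 data-parallel ranks, global_batch_size = 128 implies
      # a single micro-batch per rank per step (no gradient accumulation).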
      shuffle: True
      num_workers: 8
      pin_memory: True
      max_src_seq_length: 512
      max_tgt_seq_length: 512
      drop_last: True
      concat_sampling_probabilities: ??? # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random'
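      # Illustrative override (hypothetical paths): one sampling probability per file,
      # and the probabilities should sum to 1.0, e.g.
      #   file_names:
      #     - /data/t0/ag_news_train.jsonl
      #     - /data/t0/anli_r1_train.jsonl
      #   concat_sampling_probabilities:
      #     - 0.7
      #     - 0.3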
      replace_bos_with_pad: False # Replaces bos with pad for both the encoder and decoder. This is necessary when using Google's T5 checkpoints.
      add_bos_to_input: False # Adds bos to the input sequence.
      add_eos_to_input: False # Adds eos to the input sequence.
      seed: 1234
    validation_ds:
      file_names: ??? # Path to a list of JSONL files corresponding to the source data.
      names: null # Names of the corresponding datasets used to log metrics.
      global_batch_size: 16
      micro_batch_size: 16
      shuffle: False
      num_workers: 0
      pin_memory: True
      max_src_seq_length: 512
      max_tgt_seq_length: 512
      drop_last: False # TODO: Figure out if there is a way to avoid dropping last.
      write_predictions_to_file: False
      output_file_path_prefix: null # Prefix of the file to write predictions to.
      metric:
        name: "exact_string_match" # Name of the evaluation metric to use.
        average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
        num_classes: null
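      # Illustrative alternative (assumed, not part of the original config): for a fixed-label
      # task you could score with a class-level metric instead, e.g.
      #   metric:
      #     name: "f1"
      #     average: "micro"
      #     num_classes: 3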
      replace_bos_with_pad: ${model.data.train_ds.replace_bos_with_pad}
      add_bos_to_input: ${model.data.train_ds.add_bos_to_input}
      add_eos_to_input: ${model.data.train_ds.add_eos_to_input}
      seed: 1234

  optim:
    name: fused_adam
    lr: 5e-6
    weight_decay: 0.0
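
# Illustrative launch command (hypothetical script name and paths, shown for context only):
#   python examples/nlp/language_modeling/megatron_t0_training.py \
#     model.restore_from_path=/models/megatron_t5.nemo \
#     model.data.train_ds.file_names=[/data/t0/train.jsonl] \
#     model.data.train_ds.concat_sampling_probabilities=[1.0] \
#     model.data.validation_ds.file_names=[/data/t0/val.jsonl]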