# NeMo: examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_config.yaml
name: megatron_virtual_prompt_gpt

trainer:
  devices: 1
  accelerator: gpu
  num_nodes: 1
  precision: 16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: 3 # min 25 recommended
  max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
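  # Illustrative arithmetic for the formula above, using values from this config rather than extra settings:
  # with micro_batch_size 4, data_parallel_size 1, and two accumulation steps (implied by global_batch_size 8
  # below), each global step consumes 4 * 1 * 2 = 8 samples.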
  log_every_n_steps: 10 # frequency with which training steps are logged
  val_check_interval: 1.0 # If an int n > 1, runs val every n training steps; if a float in 0.0 - 1.0, runs val every epoch fraction, e.g. 0.25 runs val every quarter epoch
  gradient_clip_val: 1.0
  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
  benchmark: False

exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: ${name}
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 2
    mode: min
    save_nemo_on_train_end: False # Should be False; the correct prompt learning model file is saved at model.nemo_path set below
    filename: 'megatron_gpt_prompt_tune--{val_loss:.3f}-{step}'
    model_parallel_size: ${model.tensor_model_parallel_size}
    save_best_model: True
  create_early_stopping_callback: True
  early_stopping_callback_params:
    monitor: "val_loss"
    mode: "min"
    min_delta: 0.001
    patience: 10
    verbose: True

model:
  seed: 1234
  nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved
  virtual_prompt_style: 'p-tuning' # one of 'prompt-tuning', 'p-tuning', or 'inference'
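  # Roughly (see the NeMo prompt learning docs for the authoritative description): 'prompt-tuning' learns the
  # virtual token embeddings directly, 'p-tuning' generates them with the prompt encoder configured under
  # p_tuning below, and 'inference' is for loading an already tuned model without adding new prompts.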
  tensor_model_parallel_size: 1 # intra-layer model parallelism
  pipeline_model_parallel_size: 1 # inter-layer model parallelism
  global_batch_size: 8
  micro_batch_size: 4
  validation_global_batch_size: ${model.global_batch_size}
  validation_micro_batch_size: ${model.micro_batch_size}
  validation_drop_last: False
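  # Illustrative note: in Megatron-style configs global_batch_size = micro_batch_size * data_parallel_size *
  # gradient-accumulation steps, so with the single-GPU trainer above 8 = 4 * 1 * 2, i.e. two accumulation
  # steps per global batch.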

  restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with
  language_model_path: ??? # Path to the GPT language model .nemo file, always required
  save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training.
  existing_tasks: ['boolq', 'intent_and_slot'] # List of tasks the model has already been p-tuned/prompt-tuned for, needed when a restore path is given
  new_tasks: ['rte'] # List of new tasknames to be prompt-tuned

  ## Sequence Parallelism
  # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
  # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
  sequence_parallel: False

  ## Activation Checkpoint
  activations_checkpoint_granularity: null # 'selective' or 'full'
  activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
  # of each chunk at the specified granularity
  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
  activations_checkpoint_num_layers: null # not used with 'selective'

  task_templates: # Add more/replace tasks as needed, these are just examples
  - taskname: "boolq" # The task name
    prompt_template: "<|VIRTUAL_PROMPT_0|> Passage: {passage} <|VIRTUAL_PROMPT_1|> \nQuestion: {question} \nAnswer: {answer}" # Prompt template for the task; specify virtual prompt positions with <|VIRTUAL_PROMPT_#|>
    total_virtual_tokens: 30 # Sum of tokens in virtual_token_splits must add to this number. Can differ between new and existing tasks, but must match across all new tasks being tuned at the same time.
    virtual_token_splits: [20, 10] # number of virtual tokens to be inserted at each VIRTUAL PROMPT location; must add up to total_virtual_tokens
    truncate_field: "passage" # The {field} in the prompt template whose text will be truncated if the input is too long; if null, inputs that are too long are simply skipped.
    answer_only_loss: True
    answer_field: "answer"
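    # Worked example of the split above: the first 20 virtual tokens fill <|VIRTUAL_PROMPT_0|> before "Passage:"
    # and the remaining 10 fill <|VIRTUAL_PROMPT_1|> before "Question:"; 20 + 10 = 30 = total_virtual_tokens.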
- taskname: "intent_and_slot"
prompt_template: "<|VIRTUAL_PROMPT_0|> intent options: {intent_options} <|VIRTUAL_PROMPT_1|> slot options: {slot_options} <|VIRTUAL_PROMPT_2|> {utterance} \nintent: {intent} \nslot: {slot}"
total_virtual_tokens: 30
answer_only_loss: False
virtual_token_splits: [15, 10, 5]
truncate_field: null
- taskname: "rte"
prompt_template: "<|VIRTUAL_PROMPT_0|>{premise}\n{hypothesis}\nAnswer: {answer}"
total_virtual_tokens: 9
virtual_token_splits: [9]
truncate_field: null
answer_only_loss: True
answer_field: "answer"
- taskname: "squad"
prompt_template: "<|VIRTUAL_PROMPT_0|> context: {context} question: {question} answer: {answer}"
total_virtual_tokens: 10
virtual_token_splits: [10]
truncate_field: null
answer_only_loss: True
answer_field: "answer"
- taskname: "taskname"
prompt_template: "<|VIRTUAL_PROMPT_0|> {prompt} {completion}"
total_virtual_tokens: 100
virtual_token_splits: [100]
truncate_field: null
answer_only_loss: True
answer_field: "completion"

  prompt_tuning: # Prompt tuning specific params
    new_prompt_init_methods: ['text'] # List of 'text' or 'random'; entries should correspond to the tasks listed in new_tasks
    new_prompt_init_text: ['some init text goes here'] # init text for each task whose init method is 'text', or None if the init method is 'random'
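    # Illustrative only: the two lists are expected to line up one-to-one with model.new_tasks, e.g. for
    # new_tasks: ['rte'] give a single method and a single init string (or 'random' with a None placeholder).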

  p_tuning: # P-tuning specific params
    encoder_type: "tpmlp" # ['tpmlp', 'lstm', 'biglstm', 'mlp']
    dropout: 0.0
    num_layers: 2 # number of layers for the MLP or LSTM encoder; has no effect for tpmlp, which always uses two layers
    encoder_hidden: 2048 # encoder hidden size for biglstm and tpmlp
    init_std: 0.023 # init std for tpmlp layers
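    # Rough sketch of what these knobs control (see the NeMo implementation for details): the p-tuning prompt
    # encoder maps learned inputs through a small network of width encoder_hidden ("tpmlp" is a tensor-parallel
    # MLP) to produce the virtual token embeddings; dropout and init_std apply to that encoder, while the GPT
    # model itself stays frozen.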

  data:
    train_ds: [data/rte_train.jsonl,]
    validation_ds: [data/rte_val.jsonl,]
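    # Each line of these .jsonl files is a JSON object with a "taskname" key plus the {fields} of that task's
    # prompt_template; an illustrative (hypothetical) line for the rte files above:
    # {"taskname": "rte", "premise": "...", "hypothesis": "...", "answer": "yes"}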
    add_eos: True
    shuffle: True
    num_workers: 8
    pin_memory: True
    train_cache_data_path: null # the path to the train cache data
    validation_cache_data_path: null # the path to the validation cache data
    test_cache_data_path: null # the path to the test cache data
    load_cache: False # whether to load from the cache data
    max_seq_length: 1024 # filter out training and validation examples longer than 1024 tokens. If set to None, defaults to the model's encoder length.
    min_seq_length: 1 # filter out training and validation examples shorter than 1 token.

  optim:
    name: fused_adam
    lr: 1e-4
    weight_decay: 0.01
    betas:
    - 0.9
    - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 50
      min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1
      constant_steps: 0 # Constant steps should also be 0 when min_lr=0
      monitor: val_loss
      reduce_on_plateau: false