# NeMo: examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_config.yaml
name: megatron_virtual_prompt_gpt

trainer:
  devices: 1
  accelerator: gpu
  num_nodes: 1
  precision: 16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: 3 # min 25 recommended
  max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
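  # Illustrative arithmetic for the formula above, using values from this config rather than extra settings:
  # with micro_batch_size 4, data_parallel_size 1, and two accumulation steps (implied by global_batch_size 8
  # below), each global step consumes 4 * 1 * 2 = 8 samples.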
  log_every_n_steps: 10 # frequency with which training steps are logged
  val_check_interval: 1.0 # If an int n > 1, runs val every n training steps; if a float in 0.0 - 1.0, runs val every epoch fraction, e.g. 0.25 runs val every quarter epoch
  gradient_clip_val: 1.0
  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
  benchmark: False

exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: ${name}
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 2
    mode: min
    save_nemo_on_train_end: False # Should be False; the correct prompt learning model file is saved at model.nemo_path set below
    filename: 'megatron_gpt_prompt_tune--{val_loss:.3f}-{step}'
    model_parallel_size: ${model.tensor_model_parallel_size}
    save_best_model: True
  create_early_stopping_callback: True
  early_stopping_callback_params:
    monitor: "val_loss"
    mode: "min"
    min_delta: 0.001
    patience: 10
    verbose: True

model:
  seed: 1234
  nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved
  virtual_prompt_style: 'p-tuning' # one of 'prompt-tuning', 'p-tuning', or 'inference'
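  # Roughly (see the NeMo prompt learning docs for the authoritative description): 'prompt-tuning' learns the
  # virtual token embeddings directly, 'p-tuning' generates them with the prompt encoder configured under
  # p_tuning below, and 'inference' is for loading an already tuned model without adding new prompts.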
  tensor_model_parallel_size: 1 # intra-layer model parallelism
  pipeline_model_parallel_size: 1 # inter-layer model parallelism
  global_batch_size: 8
  micro_batch_size: 4
  validation_global_batch_size: ${model.global_batch_size}
  validation_micro_batch_size: ${model.micro_batch_size}
  validation_drop_last: False
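  # Illustrative note: in Megatron-style configs global_batch_size = micro_batch_size * data_parallel_size *
  # gradient-accumulation steps, so with the single-GPU trainer above 8 = 4 * 1 * 2, i.e. two accumulation
  # steps per global batch.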

  restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with
  language_model_path: ??? # Path to the GPT language model .nemo file, always required
  save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training.
  existing_tasks: ['boolq', 'intent_and_slot'] # List of tasks the model has already been p-tuned/prompt-tuned for, needed when a restore path is given
  new_tasks: ['rte'] # List of new tasknames to be prompt-tuned

  ## Sequence Parallelism
  # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
  # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
  sequence_parallel: False

  ## Activation Checkpoint
  activations_checkpoint_granularity: null # 'selective' or 'full'
  activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
  # of each chunk at the specified granularity
  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
  activations_checkpoint_num_layers: null # not used with 'selective'

  task_templates: # Add more/replace tasks as needed, these are just examples
  - taskname: "boolq" # The task name
    prompt_template: "<|VIRTUAL_PROMPT_0|> Passage: {passage} <|VIRTUAL_PROMPT_1|> \nQuestion: {question} \nAnswer: {answer}" # Prompt template for the task; specify virtual prompt positions with <|VIRTUAL_PROMPT_#|>
    total_virtual_tokens: 30 # Sum of tokens in virtual_token_splits must add to this number. Can differ between new and existing tasks, but must match across all new tasks being tuned at the same time.
    virtual_token_splits: [20, 10] # number of virtual tokens to be inserted at each VIRTUAL PROMPT location; must add up to total_virtual_tokens
    truncate_field: "passage" # The {field} in the prompt template whose text will be truncated if the input is too long; if null, inputs that are too long are simply skipped.
    answer_only_loss: True
    answer_field: "answer"
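    # Worked example of the split above: the first 20 virtual tokens fill <|VIRTUAL_PROMPT_0|> before "Passage:"
    # and the remaining 10 fill <|VIRTUAL_PROMPT_1|> before "Question:"; 20 + 10 = 30 = total_virtual_tokens.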
- taskname: "intent_and_slot"
prompt_template: "<|VIRTUAL_PROMPT_0|> intent options: {intent_options} <|VIRTUAL_PROMPT_1|> slot options: {slot_options} <|VIRTUAL_PROMPT_2|> {utterance} \nintent: {intent} \nslot: {slot}"
total_virtual_tokens: 30
answer_only_loss: False
virtual_token_splits: [15, 10, 5]
truncate_field: null
- taskname: "rte"
prompt_template: "<|VIRTUAL_PROMPT_0|>{premise}\n{hypothesis}\nAnswer: {answer}"
total_virtual_tokens: 9
virtual_token_splits: [9]
truncate_field: null
answer_only_loss: True
answer_field: "answer"
- taskname: "squad"
prompt_template: "<|VIRTUAL_PROMPT_0|> context: {context} question: {question} answer: {answer}"
total_virtual_tokens: 10
virtual_token_splits: [10]
truncate_field: null
answer_only_loss: True
answer_field: "answer"
- taskname: "taskname"
prompt_template: "<|VIRTUAL_PROMPT_0|> {prompt} {completion}"
total_virtual_tokens: 100
virtual_token_splits: [100]
truncate_field: null
answer_only_loss: True
answer_field: "completion"

  prompt_tuning: # Prompt tuning specific params
    new_prompt_init_methods: ['text'] # List of 'text' or 'random'; entries should correspond to the tasks listed in new_tasks
    new_prompt_init_text: ['some init text goes here'] # init text for each task whose init method is 'text', or None if the init method is 'random'
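    # Illustrative only: the two lists are expected to line up one-to-one with model.new_tasks, e.g. for
    # new_tasks: ['rte'] give a single method and a single init string (or 'random' with a None placeholder).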

  p_tuning: # P-tuning specific params
    encoder_type: "tpmlp" # ['tpmlp', 'lstm', 'biglstm', 'mlp']
    dropout: 0.0
    num_layers: 2 # number of layers for the MLP or LSTM encoder; has no effect for tpmlp, which always uses two layers
    encoder_hidden: 2048 # encoder hidden size for biglstm and tpmlp
    init_std: 0.023 # init std for tpmlp layers
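    # Rough sketch of what these knobs control (see the NeMo implementation for details): the p-tuning prompt
    # encoder maps learned inputs through a small network of width encoder_hidden ("tpmlp" is a tensor-parallel
    # MLP) to produce the virtual token embeddings; dropout and init_std apply to that encoder, while the GPT
    # model itself stays frozen.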

  data:
    train_ds: [data/rte_train.jsonl,]
    validation_ds: [data/rte_val.jsonl,]
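    # Each line of these .jsonl files is a JSON object with a "taskname" key plus the {fields} of that task's
    # prompt_template; an illustrative (hypothetical) line for the rte files above:
    # {"taskname": "rte", "premise": "...", "hypothesis": "...", "answer": "yes"}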
    add_eos: True
    shuffle: True
    num_workers: 8
    pin_memory: True
    train_cache_data_path: null # the path to the train cache data
    validation_cache_data_path: null # the path to the validation cache data
    test_cache_data_path: null # the path to the test cache data
    load_cache: False # whether to load from the cache data
    max_seq_length: 1024 # filter out training and validation examples longer than 1024 tokens. If set to None, defaults to the model's encoder length.
    min_seq_length: 1 # filter out training and validation examples shorter than 1 token.

  optim:
    name: fused_adam
    lr: 1e-4
    weight_decay: 0.01
    betas:
    - 0.9
    - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 50
      min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1
      constant_steps: 0 # Constant steps should also be 0 when min_lr=0
      monitor: val_loss
      reduce_on_plateau: false