# megatron_virtual_prompt_gpt.yaml (commit 7934b29, 7,271 bytes)
name: megatron_virtual_prompt_gpt

trainer:
  devices: 1
  accelerator: gpu
  num_nodes: 1
  precision: 16
  logger: false  # logger provided by exp_manager
  enable_checkpointing: false
  replace_sampler_ddp: false
  max_epochs: 3  # min 25 recommended
  max_steps: -1  # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  log_every_n_steps: 10  # frequency with which training steps are logged
  val_check_interval: 1.0  # if an int n > 1, runs val every n training steps; if a float 0.0 - 1.0, runs val every epoch fraction (e.g. 0.25 = every quarter epoch)
  gradient_clip_val: 1.0
  resume_from_checkpoint: null  # path to a checkpoint file to continue training; restores the whole state including epoch, step, LR schedulers, apex, etc.
  benchmark: false
exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: ${name}
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: true
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 2
    mode: min
    save_nemo_on_train_end: false  # should be false; the correct prompt learning model file is saved at model.nemo_path set below
    filename: 'megatron_gpt_prompt_tune--{val_loss:.3f}-{step}'
    model_parallel_size: ${model.tensor_model_parallel_size}
    save_best_model: true
  create_early_stopping_callback: true
  early_stopping_callback_params:
    # quoting removed so monitor/mode match the checkpoint_callback_params style above
    monitor: val_loss
    mode: min
    min_delta: 0.001
    patience: 10
    verbose: true
model:
  seed: 1234
  nemo_path: ${name}.nemo  # .nemo filename/absolute path where the virtual prompt model parameters will be saved
  virtual_prompt_style: 'p-tuning'  # one of 'prompt-tuning', 'p-tuning', or 'inference'
  tensor_model_parallel_size: 1  # intra-layer model parallelism
  pipeline_model_parallel_size: 1  # inter-layer model parallelism
  global_batch_size: 8
  micro_batch_size: 4
  validation_global_batch_size: ${model.global_batch_size}
  validation_micro_batch_size: ${model.micro_batch_size}
  validation_drop_last: false
  restore_path: null  # path to an existing p-tuned/prompt-tuned .nemo model you wish to add new tasks to or run inference with
  language_model_path: ???  # path to the GPT language model .nemo file, always required
  save_nemo_on_validation_end: true  # saves an inference-ready .nemo file every time a checkpoint is saved during training
  existing_tasks: ['boolq', 'intent_and_slot']  # tasks the model has already been p-tuned/prompt-tuned for; needed when a restore path is given
  new_tasks: ['rte']  # list of new tasknames to be prompt-tuned

  ## Sequence Parallelism
  # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially.
  # See "Reducing Activation Recomputation in Large Transformer Models" (https://arxiv.org/abs/2205.05198) for details.
  sequence_parallel: false

  ## Activation Checkpointing
  activations_checkpoint_granularity: null  # 'selective' or 'full'
  activations_checkpoint_method: null  # 'uniform' or 'block'; not used with 'selective'
  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
  # of each chunk at the specified granularity
  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
  activations_checkpoint_num_layers: null  # not used with 'selective'
task_templates: # Add more/replace tasks as needed, these are just examples
- taskname: "boolq" # The task name
prompt_template: "<|VIRTUAL_PROMPT_0|> Passage: {passage} <|VIRTUAL_PROMPT_1|> \nQuestion: {question} \nAnswer: {answer}" # Prompt template for task, specify virtual prompt positions with <|VIRTUAL_PROMPT_#|>
total_virtual_tokens: 30 # Sum of tokens in virtual_token_splits must add to this number. Can differ between new and existing tasks, but must match across all new tasks being tuned at the same time.
virtual_token_splits: [20, 10] # number of virtual tokens to be inserted at each VIRTUAL PROMPT location, must add to total_virtual_tokens
truncate_field: "passage" # The {field} in the prompt template whose text will be truncated if the input is too long, if null, inputs that are too long will just be skipped.
answer_only_loss: True
answer_field: "answer"
- taskname: "intent_and_slot"
prompt_template: "<|VIRTUAL_PROMPT_0|> intent options: {intent_options} <|VIRTUAL_PROMPT_1|> slot options: {slot_options} <|VIRTUAL_PROMPT_2|> {utterance} \nintent: {intent} \nslot: {slot}"
total_virtual_tokens: 30
answer_only_loss: False
virtual_token_splits: [15, 10, 5]
truncate_field: null
- taskname: "rte"
prompt_template: "<|VIRTUAL_PROMPT_0|>{premise}\n{hypothesis}\nAnswer: {answer}"
total_virtual_tokens: 9
virtual_token_splits: [9]
truncate_field: null
answer_only_loss: True
answer_field: "answer"
- taskname: "squad"
prompt_template: "<|VIRTUAL_PROMPT_0|> context: {context} question: {question} answer: {answer}"
total_virtual_tokens: 10
virtual_token_splits: [10]
truncate_field: null
answer_only_loss: True
answer_field: "answer"
- taskname: "taskname"
prompt_template: "<|VIRTUAL_PROMPT_0|> {prompt} {completion}"
total_virtual_tokens: 100
virtual_token_splits: [100]
truncate_field: null
answer_only_loss: True
answer_field: "completion"
  prompt_tuning:  # Prompt tuning specific params
    new_prompt_init_methods: ['text']  # list of 'text' or 'random'; entries should correspond to the tasks listed in new_tasks
    new_prompt_init_text: ['some init text goes here']  # init text when the init method is 'text', or None when the init method is 'random'
  p_tuning:  # P-tuning specific params
    encoder_type: "tpmlp"  # ['tpmlp', 'lstm', 'biglstm', 'mlp']
    dropout: 0.0  # presumably dropout applied inside the prompt encoder — confirm against the encoder implementation
    num_layers: 2  # number of layers for MLP or LSTM layers. Note, it has no effect for tpmlp currently as it always assumes it is two layers.
    encoder_hidden: 2048  # encoder hidden for biglstm and tpmlp
    init_std: 0.023  # init std for tpmlp layers
data:
train_ds: [data/rte_train.jsonl,]
validation_ds: [data/rte_val.jsonl,]
add_eos: True
shuffle: True
num_workers: 8
pin_memory: True
train_cache_data_path: null # the path to the train cache data
validation_cache_data_path: null # the path to the validation cache data
test_cache_data_path: null # the path to the test cache data
load_cache: False # whether to load from the cache data
max_seq_length: 1024 # filter out training and validation examples longer than 1024 tokens. Set to None will default to model's encoder length.
min_seq_length: 1 # filter out training and validation examples less than 1 token long.
optim:
name: fused_adam
lr: 1e-4
weight_decay: 0.01
betas:
- 0.9
- 0.98
sched:
name: CosineAnnealing
warmup_steps: 50
min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1
constant_steps: 0 # Constant steps should also be 0 when min_lr=0
monitor: val_loss
reduce_on_plateau: false