name: megatron_virtual_prompt_gpt

trainer:
  devices: 1
  accelerator: gpu
  num_nodes: 1
  precision: 16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: 3 # min 25 recommended
  max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  log_every_n_steps: 10 # frequency with which training steps are logged 
  val_check_interval: 1.0 # If an int n > 1, runs validation every n training steps; if a float in [0.0, 1.0], runs validation at that fraction of an epoch, e.g. 0.25 runs validation every quarter epoch
  gradient_clip_val: 1.0
  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
  benchmark: False
  


exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: ${name}
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 2
    mode: min
    save_nemo_on_train_end: False # Should be False; the correct prompt learning model file is saved at model.nemo_path, set below
    filename: 'megatron_gpt_prompt_tune--{val_loss:.3f}-{step}'
    model_parallel_size: ${model.tensor_model_parallel_size}
    save_best_model: True
  create_early_stopping_callback: True
  early_stopping_callback_params:
    monitor: "val_loss"
    mode: "min"
    min_delta: 0.001
    patience: 10
    verbose: True
  

model:
  seed: 1234
  nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved
  virtual_prompt_style: 'p-tuning' # one of 'prompt-tuning', 'p-tuning', or 'inference'
  tensor_model_parallel_size: 1 # intra-layer model parallelism
  pipeline_model_parallel_size: 1 # inter-layer model parallelism
  global_batch_size: 8
  micro_batch_size: 4
  validation_global_batch_size: ${model.global_batch_size}
  validation_micro_batch_size: ${model.micro_batch_size}
  validation_drop_last: False
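  # Illustrative arithmetic (an assumption for this setup, not read by the config): with devices=1,
  # num_nodes=1, and tensor/pipeline parallel sizes of 1, data_parallel_size = 1, so gradient
  # accumulation steps = global_batch_size / (micro_batch_size * data_parallel_size) = 8 / (4 * 1) = 2.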

  restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with
  language_model_path: ??? # Path to the GPT language model .nemo file, always required
  save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training. 
  existing_tasks: ['boolq', 'intent_and_slot'] # List of tasks the model has already been p-tuned/prompt-tuned for, needed when a restore path is given
  new_tasks: ['rte'] # List of new tasknames to be prompt-tuned
  


  ## Sequence Parallelism
  # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout along the sequence dimension
  # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
  sequence_parallel: False

  ## Activation Checkpoint 
  activations_checkpoint_granularity: null # 'selective' or 'full' 
  activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
  # of each chunk at the specified granularity
  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
  activations_checkpoint_num_layers: null # not used with 'selective'

  task_templates: # Add more/replace tasks as needed, these are just examples
  - taskname: "boolq" # The task name
    prompt_template: "<|VIRTUAL_PROMPT_0|> Passage: {passage} <|VIRTUAL_PROMPT_1|> \nQuestion: {question} \nAnswer: {answer}" # Prompt template for task, specify virtual prompt positions with <|VIRTUAL_PROMPT_#|>
    total_virtual_tokens: 30 # Sum of tokens in virtual_token_splits must add to this number. Can differ between new and existing tasks, but must match across all new tasks being tuned at the same time.
    virtual_token_splits: [20, 10] # number of virtual tokens to be inserted at each VIRTUAL PROMPT location, must add to total_virtual_tokens
    truncate_field: "passage" # The {field} in the prompt template whose text will be truncated if the input is too long, if null, inputs that are too long will just be skipped.
    answer_only_loss: True 
    answer_field: "answer"

  - taskname: "intent_and_slot"
    prompt_template: "<|VIRTUAL_PROMPT_0|> intent options: {intent_options} <|VIRTUAL_PROMPT_1|> slot options: {slot_options} <|VIRTUAL_PROMPT_2|> {utterance} \nintent: {intent} \nslot: {slot}"
    total_virtual_tokens: 30
    answer_only_loss: False 
    virtual_token_splits: [15, 10, 5]
    truncate_field: null

  - taskname: "rte" 
    prompt_template: "<|VIRTUAL_PROMPT_0|>{premise}\n{hypothesis}\nAnswer: {answer}" 
    total_virtual_tokens: 9 
    virtual_token_splits: [9] 
    truncate_field: null
    answer_only_loss: True
    answer_field: "answer"
  
  - taskname: "squad" 
    prompt_template: "<|VIRTUAL_PROMPT_0|> context: {context} question: {question} answer: {answer}" 
    total_virtual_tokens: 10
    virtual_token_splits: [10]
    truncate_field: null
    answer_only_loss: True
    answer_field: "answer"

  - taskname: "taskname"
    prompt_template: "<|VIRTUAL_PROMPT_0|> {prompt} {completion}"
    total_virtual_tokens: 100
    virtual_token_splits: [100]
    truncate_field: null
    answer_only_loss: True
    answer_field: "completion"

  prompt_tuning: # Prompt tuning specific params
    new_prompt_init_methods: ['text'] # List of 'text' or 'random', should correspond to the tasks listed in new_tasks
    new_prompt_init_text: ['some init text goes here'] # some init text if init method is text, or None if init method is random

  p_tuning: # P-tuning specific params
    encoder_type: "tpmlp" # ['tpmlp', 'lstm', 'biglstm', 'mlp'] 
    dropout: 0.0
    num_layers: 2  # number of layers for the MLP or LSTM encoder. Note: it currently has no effect for tpmlp, which always uses two layers.
    encoder_hidden: 2048 # encoder hidden for biglstm and tpmlp
    init_std: 0.023  # init std for tpmlp layers

  data:
    train_ds: [data/rte_train.jsonl,]
    validation_ds: [data/rte_val.jsonl,]
    add_eos: True
    shuffle: True
    num_workers: 8
    pin_memory: True
    train_cache_data_path: null  # the path to the train cache data 
    validation_cache_data_path: null  # the path to the validation cache data 
    test_cache_data_path: null  # the path to the test cache data 
    load_cache: False  # whether to load from the cache data
    max_seq_length: 1024  # filter out training and validation examples longer than 1024 tokens. Setting to null defaults to the model's encoder length.
    min_seq_length: 1  # filter out training and validation examples less than 1 token long.
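    # A minimal launch sketch (assumption: the standard NeMo prompt learning example script and
    # Hydra override syntax; the script and .nemo paths are placeholders, verify against your NeMo version):
    #   python examples/nlp/language_modeling/megatron_gpt_prompt_learning.py \
    #     model.language_model_path=/path/to/gpt_model.nemo \
    #     model.data.train_ds=[data/rte_train.jsonl] \
    #     model.data.validation_ds=[data/rte_val.jsonl]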


  optim:
    name: fused_adam
    lr: 1e-4
    weight_decay: 0.01 
    betas: 
    - 0.9
    - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 50
      min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1
      constant_steps: 0 # Constant steps should also be 0 when min_lr=0
      monitor: val_loss
      reduce_on_plateau: false
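      # Schedule sketch (an assumption describing standard cosine annealing with linear warmup):
      # for step t <= warmup_steps, lr ramps linearly up to 1e-4; afterwards
      # lr(t) = min_lr + (lr - min_lr) * 0.5 * (1 + cos(pi * (t - warmup_steps) / (T - warmup_steps))),
      # where T is the total number of training steps (max_steps, or the steps implied by max_epochs).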