defaults:
  - .@model.encoder: megatron_model_base_config
  - .@model.decoder: megatron_model_base_config

name: megatron_t5
restore_from_path: null # used when starting from a .nemo file

trainer:
  devices: 1
  num_nodes: 1
  accelerator: gpu
  precision: 16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
  max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
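  # data_parallel_size = (num_nodes * devices) / (tensor_model_parallel_size * pipeline_model_parallel_size);
  # e.g. 8 GPUs with tensor parallel 2 and pipeline parallel 1 give a data_parallel_size of 4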
  log_every_n_steps: 10
  val_check_interval: 100
  limit_val_batches: 50
  limit_test_batches: 500
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  benchmark: False

exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: ${name}
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
    filename: '${name}--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
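    # `multiply` resolves to tensor_model_parallel_size * pipeline_model_parallel_size,
    # i.e. 1 * 1 = 1 with this file's defaults (TP=2, PP=4 would give 8)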

model:
  # model parallelism 
  micro_batch_size: 4
  global_batch_size: 8 # will use more micro batches to reach global batch size
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  resume_from_checkpoint: null # manually set the checkpoint file to load from
  pipeline_model_parallel_split_rank: 0 # rank at which decoder starts.
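  # The number of gradient-accumulation micro batches per global step is
  # global_batch_size / (micro_batch_size * data_parallel_size); with the defaults above on a
  # single GPU that is 8 / (4 * 1) = 2 micro batches per step.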

  # model architecture
  make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.

  megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting.
  grad_allreduce_chunk_size_mb: 125
  grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce
  gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

  seq_length: 512
  max_position_embeddings: ${.seq_length}
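  # ${.seq_length} is an OmegaConf relative interpolation: it resolves to the sibling
  # model.seq_length key above, i.e. 512 here.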

  tokenizer:
    library: 'megatron'
    type: 'BertWordPieceCase'
    model: null
    vocab_file: null
    merge_file: null
    num_sentinel_tokens: 100
    sentencepiece_legacy: True # Legacy=True allows you to add special tokens to sentencepiece tokenizers.
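    # T5-style span corruption replaces each masked span with a sentinel token
    # (<extra_id_0>, <extra_id_1>, ...), so num_sentinel_tokens extra tokens are added to the vocabulary.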

  # weight init
  embedding_init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.

  # embedding dropout
  embedding_dropout: 0.1

  # embedding sharing
  share_token_embeddings: True # If True share encoder/decoder embeddings
  share_decoder_tokens_head_embeddings: True # If True share decoder embeddings and decoder projection to logits

  # token head
  tokens_head_bias: True

  # precision
  native_amp_init_scale: 4294967296 # 2 ** 32
  native_amp_growth_interval: 1000
  fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16

  # miscellaneous
  seed: 1234
  use_cpu_initialization: False # Init weights on the CPU (slow for large models)
  apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this

  data:
    # Path to data must be specified by the user.
    # Supports List, String and Dictionary
    # List: can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-t5_00_text_document,.5,/raid/data/pile/my-t5_01_text_document]",
    # Or see example below:
    # data_prefix:
    #   - .5
    #   - /raid/data/pile/my-t5_00_text_document
    #   - .5
    #   - /raid/data/pile/my-t5_01_text_document
    # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test}
    # Or see example below:
    # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}"
    data_prefix: ???
    index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix
    data_impl: mmap # mmap, retmmap, text_mmap, csv_mmap
    # data_impl_kwargs: # currently used only for text_mmap, csv_mmap (should be data_impl dependant)
    #     # defaults for text_memmap
    #     newline_int: 10 # byte-value of newline (Use ord('\n') to get value)
    #     header_lines: 0 # skip first N header lines
    #     workers: null # number of workers when creating missing index files (null defaults to cpu_num // 2)
    #     sort_dataset_paths: False # if True datasets will be sorted by name
    #     # defaults for csv_memmap
    #     newline_int: 10 # byte-value of newline
    #     header_lines: 1 # skip first N header lines
    #     workers: null # number of workers when creating missing index files (null defaults to cpu_num // 2)
    #     sort_dataset_paths: False # if True datasets will be sorted by name
    #     data_col: 1 # column to use for data
    #     data_sep: ',' # string to split text into columns
    splits_string: 949,45,5
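    # relative train/validation/test weights when a single dataset is provided;
    # 949,45,5 corresponds to roughly 95% / 4.5% / 0.5% of the data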
    seq_length: ${model.seq_length}
    seq_length_dec: 128
    skip_warmup: True
    num_workers: 0
    dataloader_type: single # options: single, cyclic
    masked_lm_prob: 0.15
    dataset_type: 't5'
    short_seq_prob: 0.0
    max_ngram_size: 10
    mean_ngram_size: null
    geometric_dist: True
    permutation: False
    whole_word_masking: True
    favor_longer_ngrams: False
    respect_document_boundaries: True # If true, a single training example cannot cross document boundaries, increasing the fraction of <pad> tokens within a batch.

  optim:
    name: fused_adam
    lr: 0.0001
    betas:
      - 0.9
      - 0.999
    eps: 1e-8
    weight_decay: 0.01
    sched:
      name: WarmupAnnealing
      min_lr: 0.00001
      last_epoch: -1
      warmup_ratio: 0.01
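      # warmup_ratio is a fraction of trainer.max_steps, so roughly 0.01 * 100000 = 1000 warmup
      # steps before the schedule anneals the learning rate from lr toward min_lr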