# NeMo/examples/nlp/language_modeling/conf/bert_pretraining_from_text_config.yaml
# BERT Pretraining from Text
name: &name PretrainingBERTFromText

trainer:
  devices: 1 # number of GPUs, 0 for CPU, or a list of GPU indices
  num_nodes: 1
  max_epochs: 2 # number of training epochs
  max_steps: -1 # takes precedence over max_epochs when set to a positive value
  accumulate_grad_batches: 1 # accumulate gradients every k batches
  precision: 16 # 16 to use AMP
  accelerator: gpu
  gradient_clip_val: 0.0
  log_every_n_steps: 1
  val_check_interval: 1.0 # check once per epoch; set to 0.25 to check 4 times per epoch
  enable_checkpointing: False # provided by exp_manager
  logger: false # provided by exp_manager
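
# The trainer block maps directly onto PyTorch Lightning Trainer arguments. As a
# rough guide (not specific to this config), the effective batch size is
# batch_size * devices * num_nodes * accumulate_grad_batches, so multi-GPU runs
# usually raise `devices` rather than the per-GPU batch_size defined below.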

model:
  nemo_path: null # path for the exported .nemo file
  only_mlm_loss: false # use only the masked language modeling loss, without next sentence prediction
  num_tok_classification_layers: 1 # number of token classification head output layers
  num_seq_classification_layers: 2 # number of sequence classification head output layers
  max_seq_length: 128
  # The maximum total input sequence length after tokenization. Sequences longer than this
  # will be truncated, and sequences shorter than this will be padded.
  mask_prob: 0.15
  # Probability of masking a token in the input text during data processing.
  short_seq_prob: 0.1
  # Probability of producing a sequence shorter than `max_seq_length` during data processing.
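
  # mask_prob: 0.15 is the masking rate used in the original BERT recipe, and
  # short_seq_prob follows the same recipe's practice of occasionally generating
  # shorter sequences so the model also sees short inputs during pretraining.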

  language_model:
    pretrained_model_name: bert-base-uncased
    lm_checkpoint: null
    config:
      attention_probs_dropout_prob: 0.1
      hidden_act: gelu
      hidden_dropout_prob: 0.1
      hidden_size: 768
      initializer_range: 0.02
      intermediate_size: 3072
      max_position_embeddings: 512
      num_attention_heads: 12
      num_hidden_layers: 12
      type_vocab_size: 2
      vocab_size: 30522
    config_file: null # json file, precedence over config
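
  # The config block above matches the published bert-base-uncased architecture
  # (12 layers, 12 heads, hidden size 768, 30522-token WordPiece vocabulary); a
  # config_file, when given, takes precedence over these inline values.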

  tokenizer:
    tokenizer_name: ${model.language_model.pretrained_model_name} # tokenizer that inherits from TokenizerSpec
    vocab_file: null # path to vocab file
    tokenizer_model: null # tokenizer model for sentencepiece
    special_tokens: # only needed to add BERT-specific special tokens if the tokenizer does not already define them
      unk_token: '[UNK]'
      sep_token: '[SEP]'
      pad_token: '[PAD]'
      bos_token: '[CLS]'
      mask_token: '[MASK]'
      eos_token: '[SEP]'
      cls_token: '[CLS]'
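
  # The values above follow the usual BERT conventions: [CLS] serves as both the
  # bos and cls token and [SEP] as both the eos and separator token, so no new
  # tokens are introduced when a standard BERT tokenizer is used.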

  train_ds:
    data_file: ??? # path to data file
    max_seq_length: ${model.max_seq_length}
    mask_prob: ${model.mask_prob}
    short_seq_prob: ${model.short_seq_prob}
    batch_size: 16 # per GPU
    shuffle: true
    num_samples: -1
    num_workers: 2
    drop_last: false
    pin_memory: false

  validation_ds:
    data_file: ??? # path to data file
    max_seq_length: ${model.max_seq_length}
    mask_prob: ${model.mask_prob}
    short_seq_prob: ${model.short_seq_prob}
    batch_size: 16 # per GPU
    shuffle: false
    num_samples: -1
    num_workers: 2
    drop_last: false
    pin_memory: false
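
  # `???` is Hydra/OmegaConf's marker for a mandatory value: the run fails unless
  # data_file is set here or overridden on the command line, e.g. (placeholder paths)
  #   model.train_ds.data_file=/path/to/train.txt model.validation_ds.data_file=/path/to/valid.txt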

  optim:
    name: adamw
    lr: 3e-5
    weight_decay: 0.0

    sched:
      name: CosineAnnealing
      warmup_steps: null
      warmup_ratio: 0.1
      min_lr: 0.0
      last_epoch: -1
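
  # warmup_ratio and warmup_steps are alternative ways to specify warmup: with
  # warmup_ratio: 0.1, roughly the first 10% of training steps use a linear warmup
  # before the cosine decay of the learning rate down to min_lr.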

exp_manager:
  exp_dir: null # where to store logs and checkpoints
  name: *name # name of the experiment
  create_tensorboard_logger: True
  create_checkpoint_callback: True
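
# exp_manager supplies the logging and checkpointing that are disabled in the
# trainer section above, writing TensorBoard logs and checkpoints under exp_dir.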

hydra:
  run:
    dir: .
  job_logging:
    root:
      handlers: null
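
# A sketch of how this config is typically consumed, assuming the companion script
# is examples/nlp/language_modeling/bert_pretraining.py (the script name may differ
# between NeMo releases) and using placeholder data paths:
#   python bert_pretraining.py \
#     model.train_ds.data_file=/path/to/train.txt \
#     model.validation_ds.data_file=/path/to/valid.txt \
#     trainer.devices=1 trainer.max_epochs=2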