# BERT Pretraining from Preprocessed (tokenized) data
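#
# Typical launch, assuming the companion bert_pretraining.py script that ships
# alongside this config in the NeMo examples tree (the data path below is a
# placeholder, not a real path):
#
#   python bert_pretraining.py \
#     --config-name=bert_pretraining_from_preprocessed_config \
#     model.train_ds.data_file=<path to preprocessed hdf5 data>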
name: &name PretrainingBERTFromPreprocessed

trainer:
  devices: 8 # number of GPUs (0 for CPU, or a list of GPU indices)
  num_nodes: 1
  max_steps: 2285714 # takes precedence over max_epochs
  num_sanity_val_steps: 0 # needed for BERT pretraining from preprocessed data
  replace_sampler_ddp: false # needed for BERT pretraining from preprocessed data
  accumulate_grad_batches: 1 # accumulates grads every k batches
  precision: 16 # 16 to use AMP
  accelerator: gpu
  gradient_clip_val: 1.0
  log_every_n_steps: 1
  val_check_interval: 1.0 # check once per epoch; 0.25 for 4 times per epoch
  enable_checkpointing: False # provided by exp_manager
  logger: false # provided by exp_manager

model:
  nemo_path: null # exported .nemo path
  only_mlm_loss: true # use only the masked language modeling loss, without next sentence prediction
  num_tok_classification_layers: 1 # number of token classification head output layers
  num_seq_classification_layers: 2 # number of sequence classification head output layers

  language_model:
    pretrained_model_name: bert-base-uncased # HuggingFace model name
    lm_checkpoint: null
    config:
      attention_probs_dropout_prob: 0.1
      hidden_act: gelu
      hidden_dropout_prob: 0.1
      hidden_size: 768
      initializer_range: 0.02
      intermediate_size: 3072
      max_position_embeddings: 512
      num_attention_heads: 12
      num_hidden_layers: 12
      type_vocab_size: 2
      vocab_size: 30522
    config_file: null # path to a JSON config file; takes precedence over the inline config above
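
    # The inline config above matches the standard BERT-base architecture
    # (12 hidden layers, 12 attention heads, hidden size 768), roughly 110M
    # parameters.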

  tokenizer: null

  train_ds:
    data_file: ??? # path to an HDF5 file (or a directory of HDF5 files)
    max_predictions_per_seq: 80
    batch_size: 16
    shuffle: true
    num_samples: -1
    num_workers: 2
    drop_last: false
    pin_memory: false
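
  # Effective global batch size is train_ds.batch_size x trainer.devices x
  # trainer.num_nodes x trainer.accumulate_grad_batches = 16 x 8 x 1 x 1 = 128
  # sequences per optimizer step.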

  optim:
    name: adamw
    lr: 0.4375e-4
    weight_decay: 0.01

    sched:
      name: SquareRootAnnealing
      warmup_steps: null
      warmup_ratio: 0.01
      min_lr: 0.0
      last_epoch: -1
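
    # With warmup_steps null, the warmup length comes from warmup_ratio:
    # 0.01 x max_steps (2285714) ~= 22857 warmup steps, after which
    # SquareRootAnnealing anneals the learning rate from its peak toward
    # min_lr (the exact curve is defined by NeMo's scheduler of that name).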

exp_manager:
  exp_dir: null # where to store logs and checkpoints
  name: *name # name of experiment
  create_tensorboard_logger: True
  create_checkpoint_callback: True

hydra:
  run:
    dir: .
  job_logging:
    root:
      handlers: null
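
# Setting hydra.run.dir to "." keeps Hydra from switching into a timestamped
# output directory, and nulling the job_logging root handlers leaves Python
# logging to NeMo (via the exp_manager configured above) rather than Hydra.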