# NVIDIA NeMo config: BERT pretraining from preprocessed (pre-tokenized, pre-masked) data.
name: &name PretrainingBERTFromPreprocessed

trainer:
  devices: 8 # number of GPUs (or a list of GPU indices); 0 for CPU
  num_nodes: 1
  max_steps: 2285714 # takes precedence over max_epochs
  num_sanity_val_steps: 0 # skip the sanity validation run
  replace_sampler_ddp: false # keep the sampler provided by the preprocessed dataset
  accumulate_grad_batches: 1 # accumulate gradients over k batches before each optimizer step
  precision: 16 # 16-bit mixed precision
  accelerator: gpu
  gradient_clip_val: 1.0 # clip gradient norm to 1.0
  log_every_n_steps: 1
  val_check_interval: 1.0 # validate once per epoch
  enable_checkpointing: False # checkpointing is handled by exp_manager
  logger: false # logging is handled by exp_manager

model:
  nemo_path: null # path for saving the trained model as a .nemo file
  only_mlm_loss: true # use only the masked-LM loss (skip the next-sentence-prediction loss)
  num_tok_classification_layers: 1 # layers in the token-level (MLM) classification head
  num_seq_classification_layers: 2 # layers in the sequence-level (NSP) classification head

  language_model:
    pretrained_model_name: bert-base-uncased
    lm_checkpoint: null
    config:
      attention_probs_dropout_prob: 0.1
      hidden_act: gelu
      hidden_dropout_prob: 0.1
      hidden_size: 768
      initializer_range: 0.02
      intermediate_size: 3072
      max_position_embeddings: 512
      num_attention_heads: 12
      num_hidden_layers: 12
      type_vocab_size: 2
      vocab_size: 30522
    config_file: null # optional JSON config file; takes precedence over the inline config above

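  # Note: the hyperparameters above are the standard BERT-Base settings (12 layers, 12 heads,
  # hidden size 768, intermediate_size = 4 * 768 = 3072, 30522-token WordPiece vocabulary,
  # roughly 110M parameters), matching the bert-base-uncased model named above.
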
  tokenizer: null # not needed here: the preprocessed data is already tokenized

  train_ds:
    data_file: ??? # required: path to the preprocessed training data
    max_predictions_per_seq: 80 # maximum number of masked tokens per sequence
    batch_size: 16 # per GPU
    shuffle: true
    num_samples: -1 # -1 uses the whole dataset
    num_workers: 2
    drop_last: false
    pin_memory: false

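  # With the trainer settings above, the effective global batch size is
  # batch_size * devices * num_nodes * accumulate_grad_batches = 16 * 8 * 1 * 1 = 128
  # sequences per optimizer step.
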
  optim:
    name: adamw
    lr: 0.4375e-4 # peak learning rate (4.375e-5)
    weight_decay: 0.01

    sched:
      name: SquareRootAnnealing
      warmup_steps: null # if null, warmup_ratio is used instead
      warmup_ratio: 0.01 # warmup length as a fraction of total training steps
      min_lr: 0.0
      last_epoch: -1

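  # With warmup_ratio = 0.01 and trainer.max_steps = 2285714, the schedule warms up for
  # roughly 0.01 * 2285714 ~= 22857 steps to the peak lr of 4.375e-5, then anneals toward min_lr.
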
exp_manager:
  exp_dir: null # where to store logs and checkpoints
  name: *name # experiment name (reuses the anchor defined at the top)
  create_tensorboard_logger: True
  create_checkpoint_callback: True

hydra:
  run:
    dir: . # run in the current directory instead of Hydra's timestamped output dir
  job_logging:
    root:
      handlers: null # let the application configure logging instead of Hydra
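
# A minimal launch sketch. The script name is an assumption (NeMo ships a BERT pretraining
# example script); any key in this file can be overridden from the command line via Hydra:
#
#   python bert_pretraining.py \
#       model.train_ds.data_file=/path/to/preprocessed_data \
#       trainer.devices=8 trainer.max_steps=2285714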