# NeMo/examples/nlp/language_modeling/conf/bert_pretraining_from_text_config.yaml
# BERT Pretraining from Text
name: &name PretrainingBERTFromText

trainer:
  devices: 1 # number of GPUs, 0 for CPU, or a list of GPU indices
  num_nodes: 1
  max_epochs: 2 # number of training epochs
  max_steps: -1 # takes precedence over max_epochs when set to a positive value
  accumulate_grad_batches: 1 # accumulate gradients every k batches
  precision: 16 # 16 to use AMP
  accelerator: gpu
  gradient_clip_val: 0.0
  log_every_n_steps: 1
  val_check_interval: 1.0 # check once per epoch; set to 0.25 to check 4 times per epoch
  enable_checkpointing: False # provided by exp_manager
  logger: false # provided by exp_manager
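
# The trainer block maps directly onto PyTorch Lightning Trainer arguments. As a
# rough guide (not specific to this config), the effective batch size is
# batch_size * devices * num_nodes * accumulate_grad_batches, so multi-GPU runs
# usually raise `devices` rather than the per-GPU batch_size defined below.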

model:
  nemo_path: null # path for the exported .nemo file
  only_mlm_loss: false # use only the masked language modeling loss, without next sentence prediction
  num_tok_classification_layers: 1 # number of token classification head output layers
  num_seq_classification_layers: 2 # number of sequence classification head output layers
  max_seq_length: 128
  # The maximum total input sequence length after tokenization. Sequences longer than this
  # will be truncated, and sequences shorter than this will be padded.
  mask_prob: 0.15
  # Probability of masking a token in the input text during data processing.
  short_seq_prob: 0.1
  # Probability of producing a sequence shorter than `max_seq_length` during data processing.
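
  # mask_prob: 0.15 is the masking rate used in the original BERT recipe, and
  # short_seq_prob follows the same recipe's practice of occasionally generating
  # shorter sequences so the model also sees short inputs during pretraining.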

  language_model:
    pretrained_model_name: bert-base-uncased
    lm_checkpoint: null
    config:
      attention_probs_dropout_prob: 0.1
      hidden_act: gelu
      hidden_dropout_prob: 0.1
      hidden_size: 768
      initializer_range: 0.02
      intermediate_size: 3072
      max_position_embeddings: 512
      num_attention_heads: 12
      num_hidden_layers: 12
      type_vocab_size: 2
      vocab_size: 30522
    config_file: null # json file, precedence over config
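
  # The config block above matches the published bert-base-uncased architecture
  # (12 layers, 12 heads, hidden size 768, 30522-token WordPiece vocabulary); a
  # config_file, when given, takes precedence over these inline values.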

  tokenizer:
    tokenizer_name: ${model.language_model.pretrained_model_name} # tokenizer that inherits from TokenizerSpec
    vocab_file: null # path to vocab file
    tokenizer_model: null # tokenizer model for sentencepiece
    special_tokens: # only needed to add BERT-specific special tokens if the tokenizer does not already define them
      unk_token: '[UNK]'
      sep_token: '[SEP]'
      pad_token: '[PAD]'
      bos_token: '[CLS]'
      mask_token: '[MASK]'
      eos_token: '[SEP]'
      cls_token: '[CLS]'
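
  # The values above follow the usual BERT conventions: [CLS] serves as both the
  # bos and cls token and [SEP] as both the eos and separator token, so no new
  # tokens are introduced when a standard BERT tokenizer is used.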

  train_ds:
    data_file: ??? # path to data file
    max_seq_length: ${model.max_seq_length}
    mask_prob: ${model.mask_prob}
    short_seq_prob: ${model.short_seq_prob}
    batch_size: 16 # per GPU
    shuffle: true
    num_samples: -1
    num_workers: 2
    drop_last: false
    pin_memory: false

  validation_ds:
    data_file: ??? # path to data file
    max_seq_length: ${model.max_seq_length}
    mask_prob: ${model.mask_prob}
    short_seq_prob: ${model.short_seq_prob}
    batch_size: 16 # per GPU
    shuffle: false
    num_samples: -1
    num_workers: 2
    drop_last: false
    pin_memory: false
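
  # `???` is Hydra/OmegaConf's marker for a mandatory value: the run fails unless
  # data_file is set here or overridden on the command line, e.g. (placeholder paths)
  #   model.train_ds.data_file=/path/to/train.txt model.validation_ds.data_file=/path/to/valid.txt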

  optim:
    name: adamw
    lr: 3e-5
    weight_decay: 0.0

    sched:
      name: CosineAnnealing
      warmup_steps: null
      warmup_ratio: 0.1
      min_lr: 0.0
      last_epoch: -1
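
  # warmup_ratio and warmup_steps are alternative ways to specify warmup: with
  # warmup_ratio: 0.1, roughly the first 10% of training steps use a linear warmup
  # before the cosine decay of the learning rate down to min_lr.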

exp_manager:
  exp_dir: null # where to store logs and checkpoints
  name: *name # name of the experiment
  create_tensorboard_logger: True
  create_checkpoint_callback: True
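
# exp_manager supplies the logging and checkpointing that are disabled in the
# trainer section above, writing TensorBoard logs and checkpoints under exp_dir.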

hydra:
  run:
    dir: .
  job_logging:
    root:
      handlers: null
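
# A sketch of how this config is typically consumed, assuming the companion script
# is examples/nlp/language_modeling/bert_pretraining.py (the script name may differ
# between NeMo releases) and using placeholder data paths:
#   python bert_pretraining.py \
#     model.train_ds.data_file=/path/to/train.txt \
#     model.validation_ds.data_file=/path/to/valid.txt \
#     trainer.devices=1 trainer.max_epochs=2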