# examples/asr/conf/ssl/wav2vec/wav2vec_ci.yaml
# This config file contains parameters for pretraining a Wav2Vec self-supervised encoder.
# These parameters are based on the FairSeq implementation.
# See: https://github.com/pytorch/fairseq/blob/master/examples/wav2vec/config/pretraining/wav2vec2_base_librispeech.yaml
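# Example launch with Hydra overrides (a sketch; the pretraining script path below is
# an assumption based on NeMo's examples layout and may differ between releases):
#
#   python examples/asr/speech_pretraining/speech_pre_training.py \
#       --config-path=../conf/ssl/wav2vec --config-name=wav2vec_ci \
#       model.train_ds.manifest_filepath=/path/to/train_manifest.json \
#       model.validation_ds.manifest_filepath=/path/to/val_manifest.json \
#       trainer.devices=1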
name: &name "Wav2vec_Pretrain"
model:
  sample_rate: &sample_rate 16000
  feature_penalty: 0.0
  dropout_features: 0.1 # Dropout applied to inputs to context encoder
  dropout_features_q: 0.1 # Dropout applied to inputs to target quantizer
  embedding_dim: &emb_dim 768 # Embedding dimension for the transformer (conv features are projected to this size)
  final_dim: &final_dim 256 # Project final representations and targets to this dimension (target embeddings)
  train_ds:
    manifest_filepath: ???
    sample_rate: *sample_rate
    batch_size: 8
    trim_silence: false
    max_duration: 20.0
    min_duration: 8.0
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    use_start_end_token: false
    num_workers: 8
    pin_memory: true
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null
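  # Each manifest is a JSON-lines file with one entry per utterance. A minimal sketch
  # (fields follow NeMo's standard audio manifest format; "text" may be left empty
  # for self-supervised pretraining):
  #
  #   {"audio_filepath": "/data/audio/utt0001.wav", "duration": 12.7, "text": ""}
  #   {"audio_filepath": "/data/audio/utt0002.wav", "duration": 9.4, "text": ""}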
  validation_ds:
    manifest_filepath: ???
    sample_rate: *sample_rate
    batch_size: 8
    shuffle: false
    use_start_end_token: false
    num_workers: 8
    pin_memory: true
    max_duration: 20.0
    min_duration: 8.0
  preprocessor:
    _target_: nemo.collections.asr.modules.wav2vec_modules.ConvFeatureEncoder
    extractor_mode: layer_norm # Normalization mode for feature extractor, one of [group_norm, layer_norm]
    conv_bias: false # Include bias in convolution feature extractor model
    feature_grad_mult: 1.0 # Multiplier on gradients flowing back through the feature extractor
    normalize_audio: true
    embedding_dim: *emb_dim # Projected final depth of feature embeddings
    conv_layers:
      - emb_dim: 512
        kernel_size: 10
        stride: 5
      - emb_dim: 512
        kernel_size: 3
        stride: 2
      - emb_dim: 512
        kernel_size: 3
        stride: 2
      - emb_dim: 512
        kernel_size: 3
        stride: 2
      - emb_dim: 512
        kernel_size: 3
        stride: 2
      - emb_dim: 512
        kernel_size: 2
        stride: 2
      - emb_dim: 512
        kernel_size: 2
        stride: 2
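    # Worked out from the strides above: total downsampling is 5*2*2*2*2*2*2 = 320
    # samples per output frame, i.e. one frame every 20 ms at 16 kHz, with a receptive
    # field of 400 samples (25 ms) per frame, matching the wav2vec 2.0 feature encoder.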
  spec_augment:
    _target_: nemo.collections.asr.modules.MaskedPatchAugmentation
    freq_masks: 3
    freq_width: 20
    patch_size: 12
    mask_patches: 0.5
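  # MaskedPatchAugmentation masks contiguous patches of time steps (patch_size frames
  # each, with a fractional mask_patches interpreted as the share of patches to mask)
  # plus SpecAugment-style frequency masking (freq_masks bands of up to freq_width bins).
  # This reading is an assumption; confirm against the MaskedPatchAugmentation docstring
  # in your NeMo version.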
  encoder:
    _target_: nemo.collections.asr.modules.wav2vec_modules.Wav2VecTransformerEncoder
    layer_drop: 0.05
    pos_embed: # Config for convolutional model that generates positional embeddings required for attention layer
      embedding_dim: *emb_dim
      conv_pos: 128 # Number of filters for convolutional positional embeddings
      conv_pos_groups: 16 # Number of groups for convolutional positional embeddings
    transformer: # Config for nemo.collections.nlp.modules.common.transformer.TransformerEncoder
      num_layers: 6 # Number of encoder layers in transformer model
      hidden_size: *emb_dim # Encoder embedding dim
      inner_size: 1536 # Encoder embedding dim for feed forward
      num_attention_heads: 4 # Number of encoder attention heads
      attn_score_dropout: 0.1 # Probability of dropout applied to attention scores
      attn_layer_dropout: 0.1 # Probability of dropout applied to the output of the attention layers, but before layer normalization
      ffn_dropout: 0.1 # Probability of dropout applied to FFN output
      hidden_act: gelu # Activation for transformer
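  # Note: this "_ci" configuration appears sized down for fast continuous-integration
  # runs (an assumption based on the filename); for comparison, the wav2vec 2.0 BASE
  # model uses 12 layers, inner_size 3072, and 8 attention heads at the same
  # hidden_size of 768.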
  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoderReconstruction
    feat_in: *emb_dim
    feat_hidden: *emb_dim
    feat_out: *final_dim
    stride_layers: 0
  loss:
    _target_: nemo.collections.asr.losses.ContrastiveLoss
    in_dim: *emb_dim
    proj_dim: *final_dim
    quantized_targets: true # Whether targets are produced by a quantizer (true) or a linear projection (false)
    sample_from_same_utterance_only: true # Whether negatives are sampled only from the same utterance
    sample_from_non_masked: false # Whether negatives are sampled from non-masked steps
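  # For reference, the wav2vec 2.0 contrastive objective (Baevski et al., 2020), which
  # this loss follows: for a masked step t with context output c_t, quantized target q_t,
  # and distractor set Q_t (q_t plus K sampled negatives),
  #   L_t = -log( exp(cos(c_t, q_t) / kappa) / sum_{q in Q_t} exp(cos(c_t, q) / kappa) )
  # NeMo's ContrastiveLoss may differ in implementation details.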
  optim:
    name: adamw
    lr: 2
    eps: 1e-06
    # optimizer arguments
    betas: [0.9, 0.98]
    weight_decay: 0.0
    # scheduler setup
    sched:
      name: NoamAnnealing
      min_lr: 0.001
      d_model: ${model.encoder.transformer.hidden_size}
      # Scheduler params
      warmup_steps: 15000
      warmup_ratio: null
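    # NoamAnnealing follows the standard Noam schedule, scaling the base lr by the
    # inverse square root of d_model and step:
    #   lr(step) = lr * d_model^(-0.5) * min(step^(-0.5), step * warmup_steps^(-1.5))
    # With lr=2, d_model=768, and warmup_steps=15000, the peak rate at the end of
    # warmup is about 2 * 768^(-0.5) * 15000^(-0.5) ~= 5.9e-4, which is why the bare
    # "lr: 2" is far larger than a typical Adam learning rate.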
trainer:
  devices: 1 # number of GPUs
  num_nodes: 1
  max_steps: -1 # computed at runtime if not set
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  gradient_clip_val: 0.0
  precision: 32 # Set to 16 to enable AMP (O1/O2 optimization levels)
  log_every_n_steps: 100 # Interval of logging
  resume_from_checkpoint: null # Path to a checkpoint file to continue training from; restores the whole state, including the epoch, step, LR schedulers, apex, etc.
  num_sanity_val_steps: 0 # Number of validation steps to run as a sanity check before training; 0 disables it
  check_val_every_n_epoch: 1 # Run validation every n epochs
  sync_batchnorm: false
  enable_checkpointing: false # Provided by exp_manager
  logger: false # Provided by exp_manager
exp_manager:
  exp_dir: null
  name: *name
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  create_wandb_logger: false
  checkpoint_callback_params:
    monitor: "val_loss"
    mode: "min"
    save_top_k: 1
    always_save_nemo: true
  wandb_logger_kwargs:
    name: null
    project: null
  resume_if_exists: false
  resume_ignore_no_checkpoint: false
hydra:
  run:
    dir: .
  job_logging:
    root:
      handlers: null