# examples/asr/conf/ssl/wav2vec/wav2vec_ci.yaml
# This config file contains parameters for pretraining a Wav2Vec self-supervised encoder.
# These parameters are based on the FairSeq implementation.
# See: https://github.com/pytorch/fairseq/blob/master/examples/wav2vec/config/pretraining/wav2vec2_base_librispeech.yaml
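# Example launch with Hydra overrides (a sketch; the pretraining script path below is
# an assumption based on NeMo's examples layout and may differ between releases):
#
#   python examples/asr/speech_pretraining/speech_pre_training.py \
#       --config-path=../conf/ssl/wav2vec --config-name=wav2vec_ci \
#       model.train_ds.manifest_filepath=/path/to/train_manifest.json \
#       model.validation_ds.manifest_filepath=/path/to/val_manifest.json \
#       trainer.devices=1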
name: &name "Wav2vec_Pretrain"
model:
  sample_rate: &sample_rate 16000
  feature_penalty: 0.0
  dropout_features: 0.1 # Dropout applied to inputs to context encoder
  dropout_features_q: 0.1 # Dropout applied to inputs to target quantizer
  embedding_dim: &emb_dim 768 # Embedding dimension for the transformer (conv features are projected to this size)
  final_dim: &final_dim 256 # Project final representations and targets to this dimension (target embeddings)
  train_ds:
    manifest_filepath: ???
    sample_rate: *sample_rate
    batch_size: 8
    trim_silence: false
    max_duration: 20.0
    min_duration: 8.0
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    use_start_end_token: false
    num_workers: 8
    pin_memory: true
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null
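  # Each manifest is a JSON-lines file with one entry per utterance. A minimal sketch
  # (fields follow NeMo's standard audio manifest format; "text" may be left empty
  # for self-supervised pretraining):
  #
  #   {"audio_filepath": "/data/audio/utt0001.wav", "duration": 12.7, "text": ""}
  #   {"audio_filepath": "/data/audio/utt0002.wav", "duration": 9.4, "text": ""}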
  validation_ds:
    manifest_filepath: ???
    sample_rate: *sample_rate
    batch_size: 8
    shuffle: false
    use_start_end_token: false
    num_workers: 8
    pin_memory: true
    max_duration: 20.0
    min_duration: 8.0
  preprocessor:
    _target_: nemo.collections.asr.modules.wav2vec_modules.ConvFeatureEncoder
    extractor_mode: layer_norm # Normalization mode for feature extractor, one of [group_norm, layer_norm]
    conv_bias: false # Include bias in convolution feature extractor model
    feature_grad_mult: 1.0 # Multiplier on gradients flowing back through the feature extractor
    normalize_audio: true
    embedding_dim: *emb_dim # Projected final depth of feature embeddings
    conv_layers:
      - emb_dim: 512
        kernel_size: 10
        stride: 5
      - emb_dim: 512
        kernel_size: 3
        stride: 2
      - emb_dim: 512
        kernel_size: 3
        stride: 2
      - emb_dim: 512
        kernel_size: 3
        stride: 2
      - emb_dim: 512
        kernel_size: 3
        stride: 2
      - emb_dim: 512
        kernel_size: 2
        stride: 2
      - emb_dim: 512
        kernel_size: 2
        stride: 2
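    # Worked out from the strides above: total downsampling is 5*2*2*2*2*2*2 = 320
    # samples per output frame, i.e. one frame every 20 ms at 16 kHz, with a receptive
    # field of 400 samples (25 ms) per frame, matching the wav2vec 2.0 feature encoder.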
  spec_augment:
    _target_: nemo.collections.asr.modules.MaskedPatchAugmentation
    freq_masks: 3
    freq_width: 20
    patch_size: 12
    mask_patches: 0.5
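  # MaskedPatchAugmentation masks contiguous patches of time steps (patch_size frames
  # each, with a fractional mask_patches interpreted as the share of patches to mask)
  # plus SpecAugment-style frequency masking (freq_masks bands of up to freq_width bins).
  # This reading is an assumption; confirm against the MaskedPatchAugmentation docstring
  # in your NeMo version.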
  encoder:
    _target_: nemo.collections.asr.modules.wav2vec_modules.Wav2VecTransformerEncoder
    layer_drop: 0.05
    pos_embed: # Config for convolutional model that generates positional embeddings required for attention layer
      embedding_dim: *emb_dim
      conv_pos: 128 # Number of filters for convolutional positional embeddings
      conv_pos_groups: 16 # Number of groups for convolutional positional embeddings
    transformer: # Config for nemo.collections.nlp.modules.common.transformer.TransformerEncoder
      num_layers: 6 # Number of encoder layers in transformer model
      hidden_size: *emb_dim # Encoder embedding dim
      inner_size: 1536 # Encoder embedding dim for feed forward
      num_attention_heads: 4 # Number of encoder attention heads
      attn_score_dropout: 0.1 # Probability of dropout applied to attention scores
      attn_layer_dropout: 0.1 # Probability of dropout applied to the output of the attention layers, but before layer normalization
      ffn_dropout: 0.1 # Probability of dropout applied to FFN output
      hidden_act: gelu # Activation for transformer
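  # Note: this "_ci" configuration appears sized down for fast continuous-integration
  # runs (an assumption based on the filename); for comparison, the wav2vec 2.0 BASE
  # model uses 12 layers, inner_size 3072, and 8 attention heads at the same
  # hidden_size of 768.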
  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoderReconstruction
    feat_in: *emb_dim
    feat_hidden: *emb_dim
    feat_out: *final_dim
    stride_layers: 0
  loss:
    _target_: nemo.collections.asr.losses.ContrastiveLoss
    in_dim: *emb_dim
    proj_dim: *final_dim
    quantized_targets: true # Whether targets are produced by a quantizer (true) or a linear projection (false)
    sample_from_same_utterance_only: true # Whether negatives are sampled only from the same utterance
    sample_from_non_masked: false # Whether negatives are sampled from non-masked steps
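  # For reference, the wav2vec 2.0 contrastive objective (Baevski et al., 2020), which
  # this loss follows: for a masked step t with context output c_t, quantized target q_t,
  # and distractor set Q_t (q_t plus K sampled negatives),
  #   L_t = -log( exp(cos(c_t, q_t) / kappa) / sum_{q in Q_t} exp(cos(c_t, q) / kappa) )
  # NeMo's ContrastiveLoss may differ in implementation details.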
  optim:
    name: adamw
    lr: 2
    eps: 1e-06
    # optimizer arguments
    betas: [0.9, 0.98]
    weight_decay: 0.0
    # scheduler setup
    sched:
      name: NoamAnnealing
      min_lr: 0.001
      d_model: ${model.encoder.transformer.hidden_size}
      # Scheduler params
      warmup_steps: 15000
      warmup_ratio: null
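    # NoamAnnealing follows the standard Noam schedule, scaling the base lr by the
    # inverse square root of d_model and step:
    #   lr(step) = lr * d_model^(-0.5) * min(step^(-0.5), step * warmup_steps^(-1.5))
    # With lr=2, d_model=768, and warmup_steps=15000, the peak rate at the end of
    # warmup is about 2 * 768^(-0.5) * 15000^(-0.5) ~= 5.9e-4, which is why the bare
    # "lr: 2" is far larger than a typical Adam learning rate.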
trainer:
  devices: 1 # number of GPUs
  num_nodes: 1
  max_steps: -1 # computed at runtime if not set
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  gradient_clip_val: 0.0
  precision: 32 # Set to 16 to enable AMP (O1/O2 optimization levels)
  log_every_n_steps: 100 # Interval of logging
  resume_from_checkpoint: null # Path to a checkpoint file to continue training from; restores the whole state, including the epoch, step, LR schedulers, apex, etc.
  num_sanity_val_steps: 0 # Number of validation steps to run as a sanity check before training; 0 disables it
  check_val_every_n_epoch: 1 # Run validation every n epochs
  sync_batchnorm: false
  enable_checkpointing: false # Provided by exp_manager
  logger: false # Provided by exp_manager
exp_manager:
  exp_dir: null
  name: *name
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  create_wandb_logger: false
  checkpoint_callback_params:
    monitor: "val_loss"
    mode: "min"
    save_top_k: 1
    always_save_nemo: true
  wandb_logger_kwargs:
    name: null
    project: null
  resume_if_exists: false
  resume_ignore_no_checkpoint: false
hydra:
  run:
    dir: .
  job_logging:
    root:
      handlers: null