# NeMo/examples/asr/conf/ssl/contextnet/contextnet_ssl.yaml
# camenduru's picture
# thanks to NVIDIA ❤
# 7934b29
# This config contains the default values for self-supervised pre-training of ContextNet encoder.
# In contrast to original ContextNet, the same number of filters is used throughout the model.
# Default learning parameters in this config are set for effective batch size of 1K. To train it with smaller effective
# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches.
# Here are the recommended configs for different variants of ContextNet; all other parameters are the same as in this config file.
#
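# +--------------+---------+------------+
# | Model        | filters | time_masks |
# +==============+=========+============+
# | Small (14M)  | 256     | 2          |
# +--------------+---------+------------+
# | Medium (40M) | 512     | 5          |
# +--------------+---------+------------+
# | Large (145M) | 1024    | 10         |
# +--------------+---------+------------+
# NOTE(review): this file defines no `time_masks` key (spec_augment uses
# MaskedPatchAugmentation with patch_size / mask_patches) - confirm whether
# the time_masks column applies to this SSL config.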
# Experiment name; exp_manager.name picks it up through the ${name} interpolation.
name: &name "ContextNet-8x-Stride-SSL"
# Model section: datasets, shared encoder defaults, feature pipeline, encoder,
# SSL losses and optimizer. Every key below is a child of `model`; the
# `${model.…}` interpolations used throughout depend on this nesting.
model:
  sample_rate: &sample_rate 16000

  # Training data.
  train_ds:
    manifest_filepath: ???  # required: placeholder that must be supplied by the user
    sample_rate: ${model.sample_rate}
    batch_size: 16  # Can be increased if memory allows or when using smaller model
    trim_silence: false
    max_duration: 16.7
    min_duration: 8.0
    shuffle: true
    use_start_end_token: false
    num_workers: 16
    pin_memory: true
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: "scatter"
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  # Validation data.
  validation_ds:
    manifest_filepath: ???  # required: placeholder that must be supplied by the user
    sample_rate: ${model.sample_rate}
    batch_size: 8
    shuffle: false
    use_start_end_token: false
    num_workers: 16
    pin_memory: true
    min_duration: 8.0

  # Shared values referenced by the encoder blocks, decoders, loss and
  # scheduler through ${model.model_defaults.*}.
  model_defaults:
    filters: 1024
    repeat: 5
    dropout: 0.1
    separable: true
    se: true
    se_context_size: -1
    kernel_size_factor: 1.0
    enc_hidden: 640
    decoder_out_channels: 128

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: ${model.sample_rate}
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: &n_mels 80  # aliased by encoder.feat_in and read by the contrastive loss in_dim
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001
    pad_to: 16
    stft_conv: false

  spec_augment:
    _target_: nemo.collections.asr.modules.MaskedPatchAugmentation
    freq_masks: 3
    freq_width: 20
    patch_size: 48
    mask_patches: 0.5

  encoder:
    _target_: nemo.collections.asr.modules.ConvASREncoder
    feat_in: *n_mels
    activation: swish
    conv_mask: true
    init_mode: "tds_uniform"

    # 23 blocks: one prologue block, 21 repeated blocks (blocks 4, 8 and 15
    # use stride 2, i.e. 2 * 2 * 2 = 8x total stride), and one epilogue block
    # that projects to enc_hidden filters.
    jasper:
      # block 1 (prologue)
      - filters: ${model.model_defaults.filters}
        repeat: 1
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      # block 2
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # block 3
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # block 4 (stride 2, first of three)
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [2]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # block 5
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # block 6
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # block 7
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # block 8 (stride 2, second of three)
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [2]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # block 9
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # block 10
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # block 11
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # block 12
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # block 13
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # block 14
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # block 15 (stride 2, third of three)
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [2]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # block 16
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # block 17
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # block 18
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # block 19
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # block 20
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # block 21
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # block 22
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # block 23 (epilogue, projects to enc_hidden)
      - filters: ${model.model_defaults.enc_hidden}
        repeat: 1
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

  # Each entry pairs a decoder with a loss.
  loss_list:
    contrastive:
      decoder:
        _target_: nemo.collections.asr.modules.ConvASRDecoderReconstruction
        feat_in: ${model.model_defaults.enc_hidden}
        feat_hidden: 128  # features in hidden layer of decoder
        feat_out: ${model.model_defaults.decoder_out_channels}
        # if loss.combine_time_steps is different than the encoder stride,
        # then a corresponding amount of stride_layers needs to
        # be added to the decoder (here stride is 8 and combine_time_steps is 4)
        stride_layers: 1
        non_stride_layers: 0
        stride_transpose: true
        apply_softmax: false
      loss:
        _target_: nemo.collections.asr.losses.ContrastiveLoss
        in_dim: ${model.preprocessor.features}
        proj_dim: ${model.model_defaults.decoder_out_channels}
        combine_time_steps: 4  # how many spectrogram time steps are used for one target/representation for contrastive task
        # should quantizer or linear layer be used
        # (quantizer is required to extract pseudo-labels for other losses)
        quantized_targets: true
        codebook_size: 300  # number of vectors in the quantization codebook per group
        num_groups: 2  # number of groups in the quantizer codebook
        num_negatives: 100  # number of sampled negatives for each target
        sample_from_same_utterance_only: true  # should negatives be sampled only from the same utterance
        sample_from_non_masked: false  # should negatives be sampled from non-masked steps

    mlm:
      decoder:
        _target_: nemo.collections.asr.modules.ConvASRDecoderReconstruction
        feat_in: ${model.model_defaults.enc_hidden}
        feat_hidden: 128  # features in hidden layer of decoder
        # this should be equal to codebook_size^groups in the contrastive loss
        # to match the targets (here 300^2 = 90000)
        feat_out: 90000
        stride_layers: 1
        stride_transpose: true
        activation: "identity"
        apply_softmax: true
      loss:
        _target_: nemo.collections.asr.losses.MLMLoss
        combine_time_steps: 4
      targets_from_loss: "contrastive"
      loss_alpha: 1000.0

  optim:
    name: adamw
    lr: 5.0
    # optimizer arguments
    betas: [0.9, 0.98]
    # Exponent literals carry an explicit mantissa decimal so that YAML 1.1
    # loaders also resolve them as floats rather than strings.
    weight_decay: 1.0e-3

    # scheduler setup
    sched:
      name: NoamAnnealing
      d_model: ${model.model_defaults.enc_hidden}
      # scheduler config override
      warmup_steps: 25000
      warmup_ratio: null
      min_lr: 1.0e-6
# Trainer settings: device selection, schedule length, precision, logging.
# Checkpointing and the logger are disabled here because exp_manager provides them.
trainer:
  devices: -1  # number of GPUs, -1 would use all available GPUs
  num_nodes: 1
  max_epochs: 1000
  max_steps: -1  # computed at runtime if not set
  val_check_interval: 1.0  # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  accelerator: auto
  strategy: ddp
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  precision: 32  # Should be set to 16 for O1 and O2 to enable the AMP.
  log_every_n_steps: 10  # Interval of logging.
  enable_progress_bar: true
  resume_from_checkpoint: null  # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
  num_sanity_val_steps: 0  # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it
  check_val_every_n_epoch: 1  # number of evaluations on validation every n epochs
  sync_batchnorm: true
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  benchmark: false  # needs to be false for models with variable-length speech input as it slows down training
# Experiment manager: output directory, loggers, checkpoint callback and resume behaviour.
exp_manager:
  exp_dir: null
  name: ${name}  # resolves to the top-level `name` key
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    # in case of multiple validation sets, first one is used
    monitor: "val_loss"
    mode: "min"
    save_top_k: 5

  # you need to set these two to true to continue the training
  resume_if_exists: false
  resume_ignore_no_checkpoint: false

  # You may use this section to create a W&B logger
  create_wandb_logger: false
  wandb_logger_kwargs:
    # nested here so this `name` does not collide with exp_manager.name
    name: null
    project: null