# Hydra/OmegaConf configuration for the "Wav2vec_Pretrain" experiment.
# Top-level sections: name, model, trainer, exp_manager, hydra.

# Experiment name. Anchored as `&name` so that exp_manager.name can alias it.
name: &name "Wav2vec_Pretrain"

# Model section: shared scalar settings, dataset definitions, the network
# modules (each `_target_` is the import path of the class to build), the
# loss, and the optimizer with its learning-rate schedule.
model:
  # Anchored values (&sample_rate, &emb_dim, &final_dim) are reused through
  # aliases further down in this section.
  sample_rate: &sample_rate 16000
  feature_penalty: 0.0
  dropout_features: 0.1
  dropout_features_q: 0.1
  embedding_dim: &emb_dim 768
  final_dim: &final_dim 256

  train_ds:
    manifest_filepath: ???  # OmegaConf mandatory value; must be supplied
    sample_rate: *sample_rate
    batch_size: 8
    trim_silence: false
    max_duration: 20.0
    min_duration: 8.0
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    use_start_end_token: false
    num_workers: 8
    pin_memory: true

    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  validation_ds:
    manifest_filepath: ???  # OmegaConf mandatory value; must be supplied
    sample_rate: *sample_rate
    batch_size: 8
    shuffle: false
    use_start_end_token: false
    num_workers: 8
    pin_memory: true
    max_duration: 20.0
    min_duration: 8.0

  preprocessor:
    _target_: nemo.collections.asr.modules.wav2vec_modules.ConvFeatureEncoder
    extractor_mode: layer_norm
    conv_bias: false
    feature_grad_mult: 1.0
    normalize_audio: true
    embedding_dim: *emb_dim
    # One entry per convolutional layer, in order.
    conv_layers:
      - emb_dim: 512
        kernel_size: 10
        stride: 5
      - emb_dim: 512
        kernel_size: 3
        stride: 2
      - emb_dim: 512
        kernel_size: 3
        stride: 2
      - emb_dim: 512
        kernel_size: 3
        stride: 2
      - emb_dim: 512
        kernel_size: 3
        stride: 2
      - emb_dim: 512
        kernel_size: 2
        stride: 2
      - emb_dim: 512
        kernel_size: 2
        stride: 2

  spec_augment:
    _target_: nemo.collections.asr.modules.MaskedPatchAugmentation
    freq_masks: 3
    freq_width: 20
    patch_size: 12
    mask_patches: 0.5

  encoder:
    _target_: nemo.collections.asr.modules.wav2vec_modules.Wav2VecTransformerEncoder
    layer_drop: 0.05
    pos_embed:
      embedding_dim: *emb_dim
      conv_pos: 128
      conv_pos_groups: 16
    transformer:
      num_layers: 6
      hidden_size: *emb_dim
      inner_size: 1536
      num_attention_heads: 4
      attn_score_dropout: 0.1
      attn_layer_dropout: 0.1
      ffn_dropout: 0.1
      hidden_act: gelu

  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoderReconstruction
    feat_in: *emb_dim
    feat_hidden: *emb_dim
    feat_out: *final_dim
    stride_layers: 0

  loss:
    _target_: nemo.collections.asr.losses.ContrastiveLoss
    in_dim: *emb_dim
    proj_dim: *final_dim
    quantized_targets: true
    sample_from_same_utterance_only: true
    sample_from_non_masked: false

  optim:
    name: adamw
    # NOTE(review): lr of 2 is unusually large for AdamW; presumably it acts
    # as a scale factor for the NoamAnnealing schedule below. Confirm.
    lr: 2
    # Written with a mantissa dot: plain YAML 1.1 loaders read `1e-06` as a
    # string, whereas `1.0e-06` is a float everywhere.
    eps: 1.0e-06

    betas: [0.9, 0.98]
    weight_decay: 0.0

    # NOTE(review): `sched` is nested under `optim` following the NeMo
    # convention; the original indentation was lost. Confirm.
    sched:
      name: NoamAnnealing
      min_lr: 0.001
      # Interpolation: resolves to encoder.transformer.hidden_size above.
      d_model: ${model.encoder.transformer.hidden_size}

      warmup_steps: 15000
      warmup_ratio: null

# Trainer section: device, precision, logging, and validation-loop settings.
trainer:
  devices: 1
  num_nodes: 1
  max_steps: -1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  gradient_clip_val: 0.0
  precision: 32
  log_every_n_steps: 100
  resume_from_checkpoint: null
  num_sanity_val_steps: 0
  check_val_every_n_epoch: 1
  sync_batchnorm: false
  # NOTE(review): checkpointing and logging are disabled here while the
  # exp_manager section enables its own checkpoint callback and TensorBoard
  # logger; presumably exp_manager provides both. Confirm.
  enable_checkpointing: false
  logger: false

# Experiment-manager section: output directory, loggers, checkpoint callback,
# and resume behaviour.
exp_manager:
  exp_dir: null
  name: *name  # alias of the top-level experiment name
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  create_wandb_logger: false
  checkpoint_callback_params:
    monitor: "val_loss"
    mode: "min"
    save_top_k: 1
    always_save_nemo: true
  # Only relevant when create_wandb_logger is true.
  wandb_logger_kwargs:
    name: null
    project: null
  resume_if_exists: false
  resume_ignore_no_checkpoint: false

# Hydra runtime overrides: run directory and job-logging handlers.
hydra:
  run:
    dir: .
  job_logging:
    root:
      handlers: null