|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from dataclasses import asdict |
|
|
|
import pytorch_lightning as pl |
|
|
|
import nemo.collections.asr as nemo_asr |
|
from nemo.collections.asr.models import EncDecCTCModel, configs |
|
from nemo.utils.exp_manager import exp_manager |
|
|
|
""" |
|
python speech_to_text_structured.py |
|
""" |
|
|
|
|
|
# Start from the default structured config for the CTC model; everything
# below (and main()) edits this object in place.
cfg = configs.EncDecCTCModelConfig()

# Model-wide values. `separable` is read by the encoder block definitions
# further down in this file.
# NOTE(review): `repeat` is assigned here but nothing visible in this file
# reads it back -- confirm whether the model config consumes it.
cfg.model.repeat = 5
cfg.model.separable = True
|
|
|
|
|
# Output vocabulary: space, lowercase a-z, apostrophe (28 symbols).
# The order is significant to anything that indexes into this list, so it is
# spelled out explicitly rather than generated.
# fmt: off
LABELS = [
    " ", "a", "b", "c", "d", "e",
    "f", "g", "h", "i", "j", "k",
    "l", "m", "n", "o", "p", "q",
    "r", "s", "t", "u", "v", "w",
    "x", "y", "z", "'",
]
# fmt: on
|
|
|
|
|
# Encoder block definitions, later assigned to `cfg.model.encoder.jasper`.
# NOTE(review): despite the name, only three blocks are defined here --
# confirm whether intermediate blocks were intentionally left out.
qn_15x5 = [
    # First block: stride 2, no residual connection.
    nemo_asr.modules.conv_asr.JasperEncoderConfig(
        filters=256,
        repeat=1,
        kernel=[33],
        stride=[2],
        separable=cfg.model.separable,
        dilation=[1],
        dropout=cfg.model.dropout,
        residual=False,
    ),
    # Second block: same width and kernel, stride 1, with residual connection.
    nemo_asr.modules.conv_asr.JasperEncoderConfig(
        filters=256,
        repeat=1,
        kernel=[33],
        stride=[1],
        separable=cfg.model.separable,
        dilation=[1],
        dropout=cfg.model.dropout,
        residual=True,
    ),
    # Final block: kernel size 1, 1024 filters. `separable` is not passed, so
    # the config class's default applies. Its `filters` value is what main()
    # uses as the decoder's input size.
    nemo_asr.modules.conv_asr.JasperEncoderConfig(
        filters=1024,
        repeat=1,
        kernel=[1],
        stride=[1],
        dilation=[1],
        dropout=cfg.model.dropout,
        residual=False,
    ),
]
|
|
|
|
|
def main():
    """Fill in the module-level ``cfg`` and train an ``EncDecCTCModel`` from it.

    The function mutates the global ``cfg`` in place (model, dataset,
    preprocessor, augmentation, encoder/decoder, optimizer, trainer and
    experiment-manager sections), then builds a ``pl.Trainer`` and an
    ``EncDecCTCModel`` from the result and calls ``trainer.fit``.

    The manifest paths assigned below are placeholder strings; replace them
    with real dataset manifests before running.
    """
    # --- Model-level values -------------------------------------------------
    cfg.name = "Mini QuartzNet"
    cfg.model.labels = LABELS

    # --- Training dataset ---------------------------------------------------
    cfg.model.train_ds.manifest_filepath = "<path to train dataset>"
    cfg.model.train_ds.labels = LABELS
    cfg.model.train_ds.sample_rate = cfg.model.sample_rate

    # --- Validation dataset -------------------------------------------------
    cfg.model.validation_ds.manifest_filepath = "<path to test dataset>"
    cfg.model.validation_ds.labels = LABELS
    cfg.model.validation_ds.sample_rate = cfg.model.sample_rate

    # --- Test dataset -------------------------------------------------------
    # Only the sample rate and labels are set; no manifest path is assigned
    # for the test split in this function.
    cfg.model.test_ds.sample_rate = cfg.model.sample_rate
    cfg.model.test_ds.labels = cfg.model.labels

    # --- Preprocessor: keep its sample rate in sync with the model's --------
    cfg.model.preprocessor.sample_rate = cfg.model.sample_rate

    # --- Spec-augment settings ----------------------------------------------
    cfg.model.spec_augment.rect_masks = 5
    cfg.model.spec_augment.rect_freq = 50
    cfg.model.spec_augment.rect_time = 120

    # --- Encoder: input size comes from the preprocessor's feature count ----
    cfg.model.encoder.feat_in = cfg.model.preprocessor.features
    cfg.model.encoder.activation = 'relu'
    cfg.model.encoder.jasper = qn_15x5

    # --- Decoder: input size is the filter count of the last encoder block --
    cfg.model.decoder.feat_in = qn_15x5[-1].filters
    cfg.model.decoder.num_classes = len(LABELS)
    cfg.model.decoder.vocabulary = LABELS

    # --- Optimizer ----------------------------------------------------------
    cfg.model.optim.name = 'novograd'
    cfg.model.optim.lr = 0.01
    cfg.model.optim.betas = [0.8, 0.5]
    cfg.model.optim.weight_decay = 0.001

    # --- LR scheduler: warmup is given as a ratio, so warmup_steps is None --
    cfg.model.optim.sched.name = "CosineAnnealing"
    cfg.model.optim.sched.warmup_steps = None
    cfg.model.optim.sched.warmup_ratio = 0.01

    # --- Trainer ------------------------------------------------------------
    cfg.trainer.devices = 1
    cfg.trainer.max_epochs = 5

    # --- Experiment manager: reuse the run name -----------------------------
    cfg.exp_manager.name = cfg.name

    # The trainer and experiment manager receive plain dicts, hence asdict();
    # the model receives the config object itself.
    trainer = pl.Trainer(**asdict(cfg.trainer))
    exp_manager(trainer, asdict(cfg.exp_manager))
    asr_model = EncDecCTCModel(cfg=cfg.model, trainer=trainer)

    trainer.fit(asr_model)
|
|
|
|
|
# Run training only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
|