In [None]:
"""
Please run notebook locally (if you have all the dependencies and a GPU). 
Technically you can run this notebook on Google Colab but you need to set up microphone for Colab.
 
Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
5. Set up microphone for Colab
"""
# If you're using Google Colab and not running locally, run this cell.

## Install dependencies
!pip install wget
!apt-get install sox libsndfile1 ffmpeg portaudio19-dev
!pip install text-unidecode
!pip install pyaudio

# ## Install NeMo
BRANCH = 'r1.17.0'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]

## Install TorchAudio
!pip install torchaudio>=0.13.0 -f https://download.pytorch.org/whl/torch_stable.html

This notebook demonstrates offline and online (from a microphone's stream in NeMo) speech command recognition. Incompatibilities between components can cause this notebook to fail when it is run locally inside a container; we may deprecate this notebook and provide a better tutorial in an upcoming release.

The notebook requires the PyAudio library to get a signal from an audio device.
For Ubuntu, please run the following commands to install it:
```
sudo apt install python3-pyaudio
pip install pyaudio
```

This notebook requires the `torchaudio` library to be installed for MatchboxNet. Please follow the instructions available at the [torchaudio installer](https://github.com/NVIDIA/NeMo/blob/main/scripts/installers/install_torchaudio_latest.sh) and [torchaudio Github page](https://github.com/pytorch/audio#installation) to install the appropriate version of torchaudio.


In [None]:
import numpy as np
import pyaudio as pa
import os, time
import librosa
import IPython.display as ipd
import matplotlib.pyplot as plt
%matplotlib inline

import nemo
import nemo.collections.asr as nemo_asr

In [None]:
# sample rate, Hz
# Used by the offline and microphone demos below for chunk sizes, FrameASR and the PyAudio stream.
SAMPLE_RATE = 16000

## Restore the model from NGC

In [None]:
# Load the pretrained MatchboxNet speech-command classification model by its published name.
mbn_model = nemo_asr.models.EncDecClassificationModel.from_pretrained("commandrecognition_en_matchboxnet3x1x64_v2")

Since the speech commands model MatchBoxNet doesn't handle the non-speech scenario, 
here we use a Voice Activity Detection (VAD) model to help reduce false alarms on background noise/silence. When speech activity is detected, the speech command inference will be activated. 


**Please note the VAD model is not perfect for every microphone input, and you might need to finetune it on your input and experiment with different parameters.**

In [None]:
# Load the pretrained 'vad_marblenet' model used to gate command recognition on speech activity.
vad_model = nemo_asr.models.EncDecClassificationModel.from_pretrained('vad_marblenet')

## Observing the config of the model

In [None]:
from omegaconf import OmegaConf
import copy

In [None]:
# Preserve a copy of the full config
# Deep copies, so later reads of labels/preprocessor/encoder do not alias the models' own config objects.
vad_cfg = copy.deepcopy(vad_model._cfg)
mbn_cfg = copy.deepcopy(mbn_model._cfg)
# Show the MatchboxNet config as YAML.
print(OmegaConf.to_yaml(mbn_cfg))

## What classes can this model recognize?

Before we begin inference on the actual audio stream, let's look at the classes this model was trained to recognize. 

**MatchBoxNet model is not designed to recognize words out of vocabulary (OOV).**

In [None]:
# Print every class label of the MatchboxNet model, left-aligned in 10-character columns.
labels = mbn_cfg.labels
for label in labels:
    print('%-10s' % label, end=' ')

## Setup preprocessor with these settings

In [None]:
# Set model to inference mode
# The trailing `;` suppresses the notebook's display of the value returned by eval().
mbn_model.eval();
vad_model.eval();

## Setting up data for Streaming Inference

In [None]:
from nemo.core.classes import IterableDataset
from nemo.core.neural_types import NeuralType, AudioSignal, LengthsType
import torch
from torch.utils.data import DataLoader

In [None]:
# simple data layer to pass audio signal
class AudioDataLayer(IterableDataset):
    """Iterable dataset that serves one in-memory audio signal per `set_signal` call.

    After `set_signal(signal)` the next iteration step yields a
    `(signal, length)` pair of tensors; the step after that raises
    `StopIteration` until a new signal is set.
    """

    def __init__(self, sample_rate):
        super().__init__()
        self._sample_rate = sample_rate
        # True while a signal is still waiting to be emitted.
        self.output = True

    @property
    def output_types(self):
        """Neural types of the two emitted tensors."""
        audio_type = NeuralType(('B', 'T'), AudioSignal(freq=self._sample_rate))
        length_type = NeuralType(tuple('B'), LengthsType())
        return {
            'audio_signal': audio_type,
            'a_sig_length': length_type,
        }

    def __len__(self):
        return 1

    def __iter__(self):
        return self

    def __next__(self):
        if self.output:
            # Mark the signal as consumed before building the tensors.
            self.output = False
            samples = torch.as_tensor(self.signal, dtype=torch.float32)
            n_samples = torch.as_tensor(self.signal_shape, dtype=torch.int64)
            return samples, n_samples
        raise StopIteration

    def set_signal(self, signal):
        """Store `signal` scaled by 1/32768 as float32 and re-arm the iterator.

        NOTE(review): the 32768 divisor presumably assumes int16-range sample values - confirm.
        """
        scaled = signal.astype(np.float32) / 32768.
        self.signal = scaled
        self.signal_shape = scaled.size
        self.output = True

In [None]:
# Single shared data layer/loader pair; infer_signal() feeds one signal at a time through it.
data_layer = AudioDataLayer(sample_rate=mbn_cfg.train_ds.sample_rate)
# batch_size=1: each inference call handles exactly one signal.
data_loader = DataLoader(data_layer, batch_size=1, collate_fn=data_layer.collate_fn)

## inference method for audio signal (single instance)

In [None]:
def infer_signal(model, signal):
    """Run `model` on a single audio signal and return the resulting logits.

    The signal is pushed through the shared `data_layer` / `data_loader`
    pair, moved to the model's device, and passed to `model.forward`.
    """
    data_layer.set_signal(signal)
    audio, audio_len = next(iter(data_loader))
    device = model.device
    return model.forward(
        input_signal=audio.to(device),
        input_signal_length=audio_len.to(device),
    )

We don't include postprocessing techniques here. 

In [None]:
# class for streaming frame-based ASR
# 1) use reset() method to reset FrameASR's state
# 2) call transcribe(frame) to do ASR on
# contiguous signal's frames
class FrameASR:
    """Frame-based streaming classifier for the MatchboxNet ('mbn') and VAD ('vad') tasks.

    Keeps a rolling buffer of `frame_len + 2 * frame_overlap` seconds of audio.
    Every `transcribe(frame)` call shifts the buffer left by one frame, appends
    the new frame at the end, and runs inference on the whole buffer.

    Usage:
      1) call `reset()` to clear the buffer;
      2) call `transcribe(frame)` on contiguous frames of the signal.
    """

    def __init__(self, model_definition,
                 frame_len=2, frame_overlap=2.5,
                 offset=0):
        '''
        Args:
          model_definition: mapping with the keys 'task' ('mbn' or 'vad'),
              'labels' and 'sample_rate'. Other keys (e.g.
              'AudioToMFCCPreprocessor', 'JasperEncoder') are accepted but not used.
          frame_len (seconds): Frame's duration
          frame_overlap (seconds): Duration of overlaps before and after current frame.
          offset: Number of trailing decoded entries dropped from every result.
        '''
        self.task = model_definition['task']
        self.vocab = list(model_definition['labels'])

        self.sr = model_definition['sample_rate']
        self.frame_len = frame_len
        self.n_frame_len = int(frame_len * self.sr)
        self.frame_overlap = frame_overlap
        self.n_frame_overlap = int(frame_overlap * self.sr)
        self.buffer = np.zeros(shape=2 * self.n_frame_overlap + self.n_frame_len,
                               dtype=np.float32)
        self.offset = offset
        self.reset()

    @torch.no_grad()
    def _decode(self, frame, offset=0):
        """Append `frame` to the rolling buffer, run the task's model and decode the logits.

        Raises:
          ValueError: if the task is neither 'mbn' nor 'vad'.
        """
        assert len(frame) == self.n_frame_len
        # Validate first so an unsupported task never mutates the buffer.
        if self.task not in ('mbn', 'vad'):
            raise ValueError("Task should either be of mbn or vad!")

        # Slide the window: drop the oldest frame, append the newest.
        self.buffer[:-self.n_frame_len] = self.buffer[self.n_frame_len:]
        self.buffer[-self.n_frame_len:] = frame

        if self.task == 'mbn':
            logits = infer_signal(mbn_model, self.buffer).to('cpu').numpy()[0]
            decoded = self._mbn_greedy_decoder(logits, self.vocab)
        else:
            logits = infer_signal(vad_model, self.buffer).to('cpu').numpy()[0]
            decoded = self._vad_greedy_decoder(logits, self.vocab)

        return decoded[:len(decoded) - offset]

    def transcribe(self, frame=None, merge=False):
        """Classify the next frame of the stream.

        Args:
          frame: samples of the next frame; `None` means a frame of zeros, and a
              short frame is zero-padded at the end to the frame length.
          merge: unused; kept so existing callers keep working.

        Returns:
          The decoded result list of the task's greedy decoder.
        """
        if frame is None:
            frame = np.zeros(shape=self.n_frame_len, dtype=np.float32)
        if len(frame) < self.n_frame_len:
            frame = np.pad(frame, [0, self.n_frame_len - len(frame)], 'constant')
        return self._decode(frame, self.offset)

    def reset(self):
        '''
        Reset frame_history and decoder's state
        '''
        self.buffer = np.zeros(shape=self.buffer.shape, dtype=np.float32)
        self.mbn_s = []
        self.vad_s = []

    @staticmethod
    def _mbn_greedy_decoder(logits, vocab):
        """Return `[label]` for the highest-scoring class, or `[]` for empty logits."""
        mbn_s = []
        if logits.shape[0]:
            class_idx = np.argmax(logits)
            mbn_s.append(vocab[class_idx])
        return mbn_s

    @staticmethod
    def _vad_greedy_decoder(logits, vocab):
        """Return `[pred_idx, pred_label, prob_of_class_0, prob_of_class_1, str(logits)]`.

        Returns `[]` for empty logits. Reads two class probabilities, so the
        logits need at least two entries.
        """
        vad_s = []
        if logits.shape[0]:
            probs = torch.softmax(torch.as_tensor(logits), dim=-1)
            pred = int(torch.argmax(probs, dim=-1))
            vad_s = [pred, str(vocab[pred]), probs[0].item(), probs[1].item(), str(logits)]
        return vad_s


# Streaming Inference

## offline inference
Here we show an example of offline streaming inference. You can use your own file or download the provided demo audio file. 


Streaming inference depends on a few factors, such as the frame length (STEP) and buffer size (WINDOW SIZE). Experiment with a few values to see their effects in the below cells.

In [None]:
# Seconds between two consecutive inferences in the offline demo.
STEP = 0.25
WINDOW_SIZE = 1.28 # input segment length for NN we used for training

In [None]:
import wave

def offline_inference(wave_file, STEP = 0.25, WINDOW_SIZE = 0.31):
    """
    Run streaming MatchboxNet inference over a wave file and print every non-empty result.

    Arg:
        wave_file: wave file to be performed inference on.
            NOTE(review): samples are read as int16, so the file is assumed to be
            16-bit mono PCM at SAMPLE_RATE - confirm for your own files.
        STEP: infer every STEP seconds
        WINDOW_SIZE : length of audio to be sent to NN.
    """
    FRAME_LEN = STEP
    CHUNK_SIZE = int(FRAME_LEN * SAMPLE_RATE)

    mbn = FrameASR(model_definition = {
                       'task': 'mbn',
                       'sample_rate': SAMPLE_RATE,
                       'AudioToMFCCPreprocessor': mbn_cfg.preprocessor,
                       'JasperEncoder': mbn_cfg.encoder,
                       'labels': mbn_cfg.labels
                   },
                   frame_len=FRAME_LEN, frame_overlap = (WINDOW_SIZE - FRAME_LEN)/2,
                   offset=0)

    # The context manager closes the file even if inference raises.
    with wave.open(wave_file, 'rb') as wf:
        data = wf.readframes(CHUNK_SIZE)
        while len(data) > 0:
            # Process the chunk that was just read, THEN fetch the next one, so the
            # first chunk is not skipped and no empty chunk is inferred at the end.
            signal = np.frombuffer(data, dtype=np.int16)
            mbn_result = mbn.transcribe(signal)

            if len(mbn_result):
                print(mbn_result)

            data = wf.readframes(CHUNK_SIZE)

In [None]:
# Download the demo recording only if it is not already in the working directory.
demo_wave = 'SpeechCommands_demo.wav'
if not os.path.exists(demo_wave):
 !wget "https://dldata-public.s3.us-east-2.amazonaws.com/SpeechCommands_demo.wav"

In [None]:
wave_file = demo_wave

CHANNELS = 1
# Resample to SAMPLE_RATE on load so the audio matches what the models are fed.
audio, sample_rate = librosa.load(wave_file, sr=SAMPLE_RATE)
# Pass the signal and its sample rate by keyword: without `sr`, librosa assumes
# 22050 Hz and reports a wrong duration for this 16 kHz signal, and recent
# librosa releases no longer accept the signal as a positional argument.
dur = librosa.get_duration(y=audio, sr=sample_rate)
print(dur)

In [None]:
# Render an audio player for the loaded demo signal.
ipd.Audio(audio, rate=sample_rate)

In [None]:
# Ground-truth is Yes No
# Uses the module-level STEP and WINDOW_SIZE, not the function's defaults.
offline_inference(wave_file, STEP, WINDOW_SIZE)

## Online inference through microphone

Please note the MatchBoxNet and VAD models are not perfect for every microphone input, and you might need to finetune them on your input and experiment with different parameters. \
**We also recommend using headphones.**

In [None]:
# Minimum speech probability from the VAD result required to print the MatchboxNet prediction.
vad_threshold = 0.8 

# NOTE: STEP and WINDOW_SIZE rebind the globals that were set for the offline demo.
# STEP is the frame length and WINDOW_SIZE the VAD buffer length (both passed to FrameASR, in seconds).
STEP = 0.1 
WINDOW_SIZE = 0.15
# Buffer length used for the MatchboxNet FrameASR instance.
mbn_WINDOW_SIZE = 1

CHANNELS = 1 
RATE = SAMPLE_RATE
FRAME_LEN = STEP # use step of vad inference as frame len

# Number of samples per microphone buffer / per frame.
CHUNK_SIZE = int(STEP * RATE)
# VAD instance: frame_overlap is chosen so frame_len + 2 * frame_overlap == WINDOW_SIZE.
vad = FrameASR(model_definition = {
 'task': 'vad',
 'sample_rate': SAMPLE_RATE,
 'AudioToMFCCPreprocessor': vad_cfg.preprocessor,
 'JasperEncoder': vad_cfg.encoder,
 'labels': vad_cfg.labels
 },
 frame_len=FRAME_LEN, frame_overlap=(WINDOW_SIZE - FRAME_LEN) / 2, 
 offset=0)

# MatchboxNet instance: same frame length, but a buffer of mbn_WINDOW_SIZE seconds.
mbn = FrameASR(model_definition = {
 'task': 'mbn',
 'sample_rate': SAMPLE_RATE,
 'AudioToMFCCPreprocessor': mbn_cfg.preprocessor,
 'JasperEncoder': mbn_cfg.encoder,
 'labels': mbn_cfg.labels
 },
 frame_len=FRAME_LEN, frame_overlap = (mbn_WINDOW_SIZE-FRAME_LEN)/2,
 offset=0)

In [None]:
# Start from empty buffers so audio from a previous run does not leak into this one.
vad.reset()
mbn.reset()

# Setup input device
p = pa.PyAudio()
try:
    print('Available audio input devices:')
    input_devices = []
    for i in range(p.get_device_count()):
        dev = p.get_device_info_by_index(i)
        if dev.get('maxInputChannels'):
            input_devices.append(i)
            print(i, dev.get('name'))

    if len(input_devices):
        dev_idx = -2
        while dev_idx not in input_devices:
            print('Please type input device ID:')
            try:
                dev_idx = int(input())
            except ValueError:
                # Non-numeric entry: ask again instead of aborting the cell.
                dev_idx = -2

        def callback(in_data, frame_count, time_info, status):
            """
            callback function for streaming audio and performing inference
            """
            signal = np.frombuffer(in_data, dtype=np.int16)
            # Both models see every chunk so that their rolling buffers stay in sync.
            vad_result = vad.transcribe(signal)
            mbn_result = mbn.transcribe(signal)

            if len(vad_result):
                # if speech prob is higher than threshold, we decide it contains speech utterance
                # and activate MatchBoxNet
                if vad_result[3] >= vad_threshold:
                    print(mbn_result)  # print mbn result when speech present
                else:
                    print("no-speech")
            return (in_data, pa.paContinue)

        # streaming
        stream = p.open(format=pa.paInt16,
                        channels=CHANNELS,
                        rate=SAMPLE_RATE,
                        input=True,
                        input_device_index=dev_idx,
                        stream_callback=callback,
                        frames_per_buffer=CHUNK_SIZE)

        # Interrupt kernel and then speak for a few more words to exit the pyaudio loop !
        try:
            print('Listening...')
            stream.start_stream()
            while stream.is_active():
                time.sleep(0.1)
        finally:
            stream.stop_stream()
            stream.close()
            print()
            print("PyAudio stopped")

    else:
        print('ERROR: No audio input device found.')
finally:
    # Always release PortAudio, including when no input device exists or opening the stream fails.
    p.terminate()

## ONNX Deployment
You can also export the model to an ONNX file and deploy it to the TensorRT or MS ONNX Runtime inference engines. If you don't have ONNX Runtime installed yet, please run:

In [None]:
# Install the prebuilt ONNX Runtime wheel.
!pip install --upgrade onnxruntime # for gpu, use onnxruntime-gpu
# Alternative (commented out): clone onnxruntime v1.8.0, build it with CUDA and install the resulting wheel.
# !mkdir -p ort
# %cd ort
# !git clone --depth 1 --branch v1.8.0 https://github.com/microsoft/onnxruntime.git .
# !./build.sh --skip_tests --config Release --build_shared_lib --parallel --use_cuda --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu --build_wheel
# !pip install ./build/Linux/Release/dist/onnxruntime*.whl
# %cd ..

Then just replace the `infer_signal` implementation with this code:

In [None]:
import onnxruntime
# Export the MatchboxNet model to 'mbn.onnx' in the working directory and open it with ONNX Runtime.
# Only mbn_model is exported here; vad_model is not.
mbn_model.export('mbn.onnx')
ort_session = onnxruntime.InferenceSession('mbn.onnx')

def to_numpy(tensor):
    """Return `tensor` as a NumPy array on the CPU, detaching it first if it tracks gradients."""
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()

def infer_signal(model=None, signal=None):
    """ONNX Runtime replacement for `infer_signal`.

    Accepts the same `(model, signal)` call that `FrameASR._decode` makes, and
    still supports the one-argument form `infer_signal(signal)`, which runs
    MatchboxNet.

    Only `mbn_model` was exported to ONNX, so `ort_session` is used for it
    alone; any other model (e.g. the VAD model) runs through its PyTorch
    forward pass.

    Returns:
      The logits as a torch tensor.
    """
    if signal is None:
        # One-argument form: the only positional argument is the signal.
        model, signal = mbn_model, model
    elif model is None:
        model = mbn_model
    if signal is None:
        raise TypeError("infer_signal() requires a signal")

    data_layer.set_signal(signal)
    audio_signal, audio_signal_len = next(iter(data_loader))
    audio_signal, audio_signal_len = audio_signal.to(model.device), audio_signal_len.to(model.device)

    if model is not mbn_model:
        # No ONNX session exists for this model; use PyTorch.
        return model.forward(input_signal=audio_signal, input_signal_length=audio_signal_len)

    # Feature extraction runs through the model's own preprocessor; its output is what the session is fed.
    processed_signal, processed_signal_len = mbn_model.preprocessor(
        input_signal=audio_signal, length=audio_signal_len,
    )
    ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(processed_signal), }
    ort_outputs = ort_session.run(None, ort_inputs)
    # The first session output holds the logits.
    return torch.from_numpy(ort_outputs[0])