import logging
from importlib.metadata import version
from timeit import default_timer as timer

import gradio as gr
import numpy as np
import onnx_asr

logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.WARNING)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.info("onnx_asr version: %s", version("onnx_asr"))
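
# Load the Silero VAD and every ASR model once at startup, so each request
# reuses the already-loaded models.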
vad = onnx_asr.load_vad("silero")
whisper = {name: onnx_asr.load_model(name) for name in ["whisper-base"]}
models_ru = {
    name: onnx_asr.load_model(name)
    for name in [
        "gigaam-v2-ctc",
        "gigaam-v2-rnnt",
        "nemo-fastconformer-ru-ctc",
        "nemo-fastconformer-ru-rnnt",
        "alphacep/vosk-model-ru",
        "alphacep/vosk-model-small-ru",
    ]
}
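
# English models are loaded with int8 quantization: smaller downloads and lower
# memory use, at some cost in accuracy.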
models_en = {
    name: onnx_asr.load_model(name, quantization="int8")
    for name in [
        "nemo-parakeet-ctc-0.6b",
        "nemo-parakeet-rnnt-0.6b",
    ]
}
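
# Every model selectable on the long-form (VAD) tab; `|` merges dicts (Python 3.9+).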
models_vad = models_ru | models_en | whisper


def recognize(audio: tuple[int, np.ndarray], models, language):
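    """Run each model in `models` on the uploaded audio and return
    [model name, transcript, elapsed time] rows for the results table."""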
    if audio is None:
        return None
    sample_rate, waveform = audio
    logger.debug("recognize: sample_rate %s, waveform.shape %s.", sample_rate, waveform.shape)
    try:
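        # Scale integer PCM to float32 in [-1, 1): waveform.itemsize is read from
        # the original integer dtype before the name is rebound, so int16 samples
        # are divided by 2**15. Stereo recordings are then downmixed to mono.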
        waveform = waveform.astype(np.float32) / 2 ** (8 * waveform.itemsize - 1)
        if waveform.ndim == 2:
            waveform = waveform.mean(axis=1)
        results = []
        for name, model in models.items():
            start = timer()
            result = model.recognize(waveform, sample_rate=sample_rate, language=language)
            time = timer() - start
            logger.debug("recognized by %s: result '%s', time %.3f s.", name, result, time)
            results.append([name, result, f"{time:.3f} s."])
    except Exception as e:
        raise gr.Error(f"{e} Audio: sample_rate: {sample_rate}, waveform.shape: {waveform.shape}.") from e
    else:
        return results


def recognize_ru(audio: tuple[int, np.ndarray]):
    return recognize(audio, models_ru | whisper, "ru")


def recognize_en(audio: tuple[int, np.ndarray]):
    return recognize(audio, models_en | whisper, "en")


def recognize_with_vad(audio: tuple[int, np.ndarray], name: str):
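    """Split the recording into speech segments with the Silero VAD and yield the
    accumulated transcript after each segment, so Gradio can stream partial results."""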
    if audio is None:
        return None
    sample_rate, waveform = audio
    logger.debug("recognize_with_vad: sample_rate %s, waveform.shape %s.", sample_rate, waveform.shape)
    try:
        waveform = waveform.astype(np.float32) / 2 ** (8 * waveform.itemsize - 1)
        if waveform.ndim == 2:
            waveform = waveform.mean(axis=1)
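        # Wrap the selected model with the VAD so only detected speech segments
        # are decoded; batch_size=1 processes one segment at a time.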
        model = models_vad[name].with_vad(vad, batch_size=1)
        results = ""
        for res in model.recognize(waveform, sample_rate=sample_rate):
            logger.debug("recognized by %s: result '%s'.", name, res)
            results += f"[{res.start:5.1f}, {res.end:5.1f}]: {res.text}\n"
            yield results
    except Exception as e:
        raise gr.Error(f"{e} Audio: sample_rate: {sample_rate}, waveform.shape: {waveform.shape}.") from e


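# Short-phrase tab: one audio input, a results table, and a button per language.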
with gr.Blocks() as recognize_short:
    audio = gr.Audio(min_length=1, max_length=20)
    with gr.Row():
        gr.ClearButton(audio)
        btn_ru = gr.Button("Recognize (ru)", variant="primary")
        btn_en = gr.Button("Recognize (en)", variant="primary")
    output = gr.Dataframe(headers=["model", "result", "time"], wrap=True)
    btn_ru.click(fn=recognize_ru, inputs=audio, outputs=output)
    btn_en.click(fn=recognize_en, inputs=audio, outputs=output)


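# Long-recording tab: choose a model, then stream the VAD-segmented transcript.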
with gr.Blocks() as recognize_long:
    name = gr.Dropdown(models_vad.keys(), label="Model")
    audio = gr.Audio(min_length=1, max_length=300)
    with gr.Row():
        gr.ClearButton(audio)
        btn = gr.Button("Recognize", variant="primary")
    output = gr.TextArea(label="result")
    btn.click(fn=recognize_with_vad, inputs=[audio, name], outputs=output)


with gr.Blocks() as demo:
    gr.Markdown("""
# ASR demo using onnx-asr

**[onnx-asr](https://github.com/istupakov/onnx-asr)** is a Python package for Automatic Speech Recognition using ONNX models.
The package is written in pure Python with minimal dependencies (no `pytorch` or `transformers`).
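
A minimal usage sketch (the silent one-second waveform is just a stand-in for real
16 kHz mono audio; the `recognize` call mirrors how this demo invokes the models):

```python
import numpy as np
import onnx_asr

model = onnx_asr.load_model("gigaam-v2-ctc")
waveform = np.zeros(16000, dtype=np.float32)  # stand-in for a real recording
print(model.recognize(waveform, sample_rate=16000))
```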
""")
    gr.TabbedInterface(
        [recognize_short, recognize_long],
        [
            "Recognition of a short phrase (up to 20 sec.)",
            "Recognition of a long phrase with VAD (up to 5 min.)",
        ],
    )
    with gr.Accordion("Models used in this demo...", open=False):
        gr.Markdown("""
## ASR models
* `gigaam-v2-ctc` - Sber GigaAM v2 CTC ([origin](https://github.com/salute-developers/GigaAM), [onnx](https://huggingface.co/istupakov/gigaam-v2-onnx))
* `gigaam-v2-rnnt` - Sber GigaAM v2 RNN-T ([origin](https://github.com/salute-developers/GigaAM), [onnx](https://huggingface.co/istupakov/gigaam-v2-onnx))
* `nemo-fastconformer-ru-ctc` - Nvidia FastConformer-Hybrid Large (ru) with CTC decoder ([origin](https://huggingface.co/nvidia/stt_ru_fastconformer_hybrid_large_pc), [onnx](https://huggingface.co/istupakov/stt_ru_fastconformer_hybrid_large_pc_onnx))
* `nemo-fastconformer-ru-rnnt` - Nvidia FastConformer-Hybrid Large (ru) with RNN-T decoder ([origin](https://huggingface.co/nvidia/stt_ru_fastconformer_hybrid_large_pc), [onnx](https://huggingface.co/istupakov/stt_ru_fastconformer_hybrid_large_pc_onnx))
* `nemo-parakeet-ctc-0.6b` - Nvidia Parakeet CTC 0.6B (en) ([origin](https://huggingface.co/nvidia/parakeet-ctc-0.6b), [onnx](https://huggingface.co/istupakov/parakeet-ctc-0.6b-onnx))
* `nemo-parakeet-rnnt-0.6b` - Nvidia Parakeet RNNT 0.6B (en) ([origin](https://huggingface.co/nvidia/parakeet-rnnt-0.6b), [onnx](https://huggingface.co/istupakov/parakeet-rnnt-0.6b-onnx))
* `whisper-base` - OpenAI Whisper Base exported with onnxruntime ([origin](https://huggingface.co/openai/whisper-base), [onnx](https://huggingface.co/istupakov/whisper-base-onnx))
* `alphacep/vosk-model-ru` - Alpha Cephei Vosk 0.54-ru ([origin](https://huggingface.co/alphacep/vosk-model-ru))
* `alphacep/vosk-model-small-ru` - Alpha Cephei Vosk 0.52-small-ru ([origin](https://huggingface.co/alphacep/vosk-model-small-ru))
## VAD models
* `silero` - Silero VAD ([origin](https://github.com/snakers4/silero-vad), [onnx](https://huggingface.co/onnx-community/silero-vad))
""")
demo.launch()