import logging
from importlib.metadata import version
from timeit import default_timer as timer

import gradio as gr
import numpy as np

import onnx_asr

logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.WARNING)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.info("onnx_asr version: %s", version("onnx_asr"))

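# Voice activity detection (VAD) model, used in the long-form demo to split audio into speech segments.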
vad = onnx_asr.load_vad("silero")

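# Whisper is multilingual, so it is added to both the Russian and English model sets below.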
whisper = {name: onnx_asr.load_model(name) for name in ["whisper-base"]}

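# Russian ASR models, all loaded once at startup.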
models_ru = {
    name: onnx_asr.load_model(name)
    for name in [
        "gigaam-v2-ctc",
        "gigaam-v2-rnnt",
        "nemo-fastconformer-ru-ctc",
        "nemo-fastconformer-ru-rnnt",
        "alphacep/vosk-model-ru",
        "alphacep/vosk-model-small-ru",
    ]
}

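# English ASR models, loaded with int8-quantized weights.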
models_en = {
    name: onnx_asr.load_model(name, quantization="int8")
    for name in [
        "nemo-parakeet-ctc-0.6b",
        "nemo-parakeet-rnnt-0.6b",
    ]
}

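# Every loaded model can be selected in the long-form (VAD) tab.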
models_vad = models_ru | models_en | whisper


def recognize(audio: tuple[int, np.ndarray], models, language):
    if audio is None:
        return None

    sample_rate, waveform = audio
    logger.debug("recognize: sample_rate %s, waveform.shape %s.", sample_rate, waveform.shape)
    try:
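        # Scale integer PCM from Gradio to float32 in [-1, 1); the divisor uses the
        # original integer dtype's width, since `waveform` is not rebound until the
        # right-hand side has been fully evaluated.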
        waveform = waveform.astype(np.float32) / 2 ** (8 * waveform.itemsize - 1)
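        # Downmix multi-channel recordings to mono by averaging the channels.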
        if waveform.ndim == 2:
            waveform = waveform.mean(axis=1)

        results = []
        for name, model in models.items():
            start = timer()
            result = model.recognize(waveform, sample_rate=sample_rate, language=language)
            time = timer() - start
            logger.debug("recognized by %s: result '%s', time %.3f s.", name, result, time)
            results.append([name, result, f"{time:.3f} s."])

    except Exception as e:
        raise gr.Error(f"{e} Audio: sample_rate: {sample_rate}, waveform.shape: {waveform.shape}.") from e
    else:
        return results


def recognize_ru(audio: tuple[int, np.ndarray]):
    return recognize(audio, models_ru | whisper, "ru")


def recognize_en(audio: tuple[int, np.ndarray]):
    return recognize(audio, models_en | whisper, "en")


def recognize_with_vad(audio: tuple[int, np.ndarray], name: str):
    if audio is None:
        return None

    sample_rate, waveform = audio
    logger.debug("recognize_with_vad: sample_rate %s, waveform.shape %s.", sample_rate, waveform.shape)
    try:
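        # Same PCM scaling and mono downmix as in recognize() above.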
        waveform = waveform.astype(np.float32) / 2 ** (8 * waveform.itemsize - 1)
        if waveform.ndim == 2:
            waveform = waveform.mean(axis=1)

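        # Wrap the selected model with VAD; recognition then yields one result
        # (with start/end timestamps) per detected speech segment.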
        model = models_vad[name].with_vad(vad, batch_size=1)
        results = ""
        for res in model.recognize(waveform, sample_rate=sample_rate):
            logger.debug("recognized by %s: result '%s'.", name, res)
            results += f"[{res.start:5.1f}, {res.end:5.1f}]: {res.text}\n"
            yield results

    except Exception as e:
        raise gr.Error(f"{e} Audio: sample_rate: {sample_rate}, waveform.shape: {waveform.shape}.") from e


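# UI for short recordings (up to 20 s): run every model for the chosen language and tabulate the results.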
with gr.Blocks() as recognize_short:
    audio = gr.Audio(min_length=1, max_length=20)
    with gr.Row():
        gr.ClearButton(audio)
        btn_ru = gr.Button("Recognize (ru)", variant="primary")
        btn_en = gr.Button("Recognize (en)", variant="primary")
    output = gr.Dataframe(headers=["model", "result", "time"], wrap=True)
    btn_ru.click(fn=recognize_ru, inputs=audio, outputs=output)
    btn_en.click(fn=recognize_en, inputs=audio, outputs=output)

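# UI for long recordings (up to 5 min): a single model combined with VAD, with results streamed segment by segment.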
with gr.Blocks() as recognize_long:
    name = gr.Dropdown(models_vad.keys(), label="Model")
    audio = gr.Audio(min_length=1, max_length=300)
    with gr.Row():
        gr.ClearButton(audio)
        btn = gr.Button("Recognize", variant="primary")
    output = gr.TextArea(label="result")
    btn.click(fn=recognize_with_vad, inputs=[audio, name], outputs=output)

with gr.Blocks() as demo:
    gr.Markdown("""
    # ASR demo using onnx-asr
    **[onnx-asr](https://github.com/istupakov/onnx-asr)** is a Python package for Automatic Speech Recognition using ONNX models.
    The package is written in pure Python with minimal dependencies (no `pytorch` or `transformers`).
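    A minimal usage sketch (mirroring what this demo does internally; the zero waveform is only a placeholder for real 16 kHz mono samples):
    ```python
    import numpy as np
    import onnx_asr

    model = onnx_asr.load_model("gigaam-v2-ctc")
    waveform = np.zeros(16000, dtype=np.float32)  # placeholder: one second of silence
    print(model.recognize(waveform, sample_rate=16000, language="ru"))
    ```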
    """)
    gr.TabbedInterface(
        [recognize_short, recognize_long],
        [
            "Recognition of a short phrase (up to 20 sec.)",
            "Recognition of a long phrase with VAD (up to 5 min.)",
        ],
    )
    with gr.Accordion("Models used in this demo...", open=False):
        gr.Markdown("""
        ## ASR models
        * `gigaam-v2-ctc` - Sber GigaAM v2 CTC ([origin](https://github.com/salute-developers/GigaAM), [onnx](https://huggingface.co/istupakov/gigaam-v2-onnx))
        * `gigaam-v2-rnnt` - Sber GigaAM v2 RNN-T ([origin](https://github.com/salute-developers/GigaAM), [onnx](https://huggingface.co/istupakov/gigaam-v2-onnx))
        * `nemo-fastconformer-ru-ctc` - Nvidia FastConformer-Hybrid Large (ru) with CTC decoder ([origin](https://huggingface.co/nvidia/stt_ru_fastconformer_hybrid_large_pc), [onnx](https://huggingface.co/istupakov/stt_ru_fastconformer_hybrid_large_pc_onnx))
        * `nemo-fastconformer-ru-rnnt` - Nvidia FastConformer-Hybrid Large (ru) with RNN-T decoder ([origin](https://huggingface.co/nvidia/stt_ru_fastconformer_hybrid_large_pc), [onnx](https://huggingface.co/istupakov/stt_ru_fastconformer_hybrid_large_pc_onnx))
        * `nemo-parakeet-ctc-0.6b` - Nvidia Parakeet CTC 0.6B (en) ([origin](https://huggingface.co/nvidia/parakeet-ctc-0.6b), [onnx](https://huggingface.co/istupakov/parakeet-ctc-0.6b-onnx))
        * `nemo-parakeet-rnnt-0.6b` - Nvidia Parakeet RNNT 0.6B (en) ([origin](https://huggingface.co/nvidia/parakeet-rnnt-0.6b), [onnx](https://huggingface.co/istupakov/parakeet-rnnt-0.6b-onnx))
        * `whisper-base` - OpenAI Whisper Base exported with onnxruntime ([origin](https://huggingface.co/openai/whisper-base), [onnx](https://huggingface.co/istupakov/whisper-base-onnx))
        * `alphacep/vosk-model-ru` - Alpha Cephei Vosk 0.54-ru ([origin](https://huggingface.co/alphacep/vosk-model-ru))
        * `alphacep/vosk-model-small-ru` - Alpha Cephei Vosk 0.52-small-ru ([origin](https://huggingface.co/alphacep/vosk-model-small-ru))
        ## VAD models
        * `silero` - Silero VAD ([origin](https://github.com/snakers4/silero-vad), [onnx](https://huggingface.co/onnx-community/silero-vad))
        """)

demo.launch()