|
import gradio as gr |
|
from transformers import pipeline |
|
from librosa import resample |
|
import numpy as np |
|
|
|
def transcribe(input_audio):
    """Transcribe an audio clip to text with the Sámi wav2vec2 model.

    Args:
        input_audio: ``(sample_rate, samples)`` tuple as produced by
            ``gr.Audio`` in numpy mode — ``samples`` is an int or float
            numpy array, mono ``(n,)`` or multi-channel ``(n, channels)``.

    Returns:
        str: the recognized text.
    """
    sr, speech = input_audio

    # Down-mix multi-channel recordings to mono by averaging channels.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)

    # Gradio delivers integer PCM (typically int16). A bare float cast
    # would leave samples in the +/-32768 range, while wav2vec2 expects
    # waveforms normalized to [-1.0, 1.0] — so scale by the dtype's max.
    if np.issubdtype(speech.dtype, np.integer):
        speech = speech.astype(np.float32) / np.iinfo(speech.dtype).max
    elif speech.dtype != np.float32:
        speech = speech.astype(np.float32)

    # The model is trained on 16 kHz audio; resample anything else.
    if sr != 16000:
        speech = resample(speech, orig_sr=sr, target_sr=16000)

    # Chunk long inputs (30 s windows with 5 s stride) so clips of
    # arbitrary length fit the model's context.
    output = pipe(speech, chunk_length_s=30, stride_length_s=5)["text"]
    return output
|
|
|
# Module-level ASR pipeline shared by transcribe(); loading the model once
# at import time avoids re-downloading/re-initializing it per request.
pipe = pipeline(
    "automatic-speech-recognition",
    # Finnish/Sámi wav2vec2-large checkpoint fine-tuned for Sámi ASR.
    model="GetmanY1/wav2vec2-large-sami-cont-pt-22k-finetuned",
    # Force CPU inference — presumably the deployment target has no GPU;
    # NOTE(review): confirm, switch to a CUDA device if one is available.
    device="cpu"
)
|
|
|
# Web UI: a single audio input (file upload or microphone) wired to
# transcribe(), rendering the returned string in a plain text output.
gradio_app = gr.Interface(
    fn=transcribe,
    # Default numpy audio mode: transcribe() receives a (sample_rate, array) tuple.
    inputs=gr.Audio(sources=["upload","microphone"]),
    outputs="text",
    title="Sámi Automatic Speech Recognition",
)
|
|
|
# Start the Gradio server only when run as a script (not on import,
# e.g. when hosted by a platform that imports `gradio_app` itself).
if __name__ == "__main__":
    gradio_app.launch()
|
|
|
|
|
|
|
|
|
|