import gradio as gr
import librosa
import torch

from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import pipeline

model_name = "Grosy/wav2vec2-base-hu"
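# The `inference` path below references `tokenizer`, `processor`, and `model`,
# which this script never instantiated; loading all three from the same
# checkpoint with from_pretrained is an assumed reconstruction that matches
# the imports above.
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
model.eval()  # inference only: disable dropout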

# Uploads are truncated to this many seconds before transcription.
max_seconds = 30


def speech_file_to_array_fn(path, max_seconds=10):
    # Load the file, resampling to the 16 kHz rate the model expects,
    # and truncate it to at most `max_seconds` of audio.
    batch = {"file": path}
    speech_array, sampling_rate = librosa.load(batch["file"], sr=16000)
    if max_seconds > 0:
        speech_array = speech_array[: max_seconds * 16000]
    batch["speech"] = speech_array
    batch["sampling_rate"] = 16000
    return batch

def inference(audio):
    # gr.Audio(type="filepath") passes a path string, so use it directly
    # instead of the original `audio.name`.
    sp = speech_file_to_array_fn(audio, max_seconds)
    sample_rate = 16000

    # `sampling_rate` is the keyword the processor expects; the original call
    # also passed `chunk_length_s` and `stride_length_s`, which are pipeline
    # options rather than processor options, so they are dropped here.
    input_values = processor(
        sp["speech"],
        sampling_rate=sample_rate,
        return_tensors="pt",
    ).input_values

    with torch.no_grad():
        logits = model(input_values).logits

    pred_ids = torch.argmax(logits, dim=-1).cpu().tolist()
    prediction = tokenizer.decode(pred_ids[0], output_word_offsets=True)

    # wav2vec2 emits one logit frame per 320 input samples, so each frame
    # spans 320 / 16000 = 0.02 s of audio.
    time_offset = 320 / sample_rate

    total_prediction = []
    words = []
    for item in prediction.word_offsets:
        s = round(item["start_offset"] * time_offset, 2)
        e = round(item["end_offset"] * time_offset, 2)
        total_prediction.append(f"{s} - {e}: {item['word']}")
        words.append(item["word"].lower())

    print(prediction.text)

    return "\n".join(total_prediction) + "\n\n" + " ".join(words)


# CPU pipeline backing the Gradio interface; long recordings are decoded in
# 10-second chunks.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model_name,
    chunk_length_s=10,
    device="cpu",
)

def transcribe(inputs):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    text = pipe(inputs, batch_size=1, return_timestamps="word")["text"]
    return text

inputs = gr.Audio(label="Input Audio", sources=["upload", "microphone"], type="filepath")
outputs = "text"
title = model_name
description = (
    f"Gradio demo for {model_name}. To use it, upload your audio or click one "
    "of the examples to load it. Read more at the links below. Currently "
    f"supports 16 kHz .wav files with a maximum duration of {max_seconds} seconds."
)
article = "<p style='text-align: center'><a href='https://github.com/GrosyT/GrosyT.github.io' target='_blank'>Github repo</a> | <a href='<HF Space link>' target='_blank'>Pretrained model</a></p>"
examples = [
    ["sample1.mp3"],
    ["sample2.mp3"],
]

gr.Interface(
    transcribe,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()