Spaces:

Grosy
/

Hu_ASR

Sleeping

File size: 3,129 Bytes

import gradio as gr
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import pipeline
import librosa


# Model identifier handed to `pipeline(model=...)` and shown as the UI title.
model_name = "Grosy/wav2vec2-base-hu"

# Manual (non-pipeline) loading of the tokenizer, processor and model.
# Left disabled, so none of these objects is created at import time.
#tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_name)
#processor = Wav2Vec2Processor.from_pretrained(model_name)
#model = Wav2Vec2ForCTC.from_pretrained(model_name)
#model.to("cpu")

# Clip length in seconds: passed by `inference` to speech_file_to_array_fn
# and quoted in the UI description text.
max_seconds = 30
# define function to read in sound file
def speech_file_to_array_fn(path, max_seconds=10, sampling_rate=16000):
    """Load an audio file as a waveform and optionally clip its length.

    Args:
        path: Location of the audio file; passed straight to
            ``librosa.load``.
        max_seconds: Number of seconds to keep, counted from the start of
            the recording. ``None``, zero or a negative value disables
            clipping. Fractional values are accepted.
        sampling_rate: Target rate in Hz requested from ``librosa.load``
            (``sr=``) and reported back in the result.

    Returns:
        A dict with the keys ``"file"`` (the given path), ``"speech"`` (the
        loaded, possibly clipped, sample array) and ``"sampling_rate"``.
    """
    # librosa returns the rate it resampled to, which is the one requested,
    # so the second element of the tuple carries no extra information.
    speech_array, _ = librosa.load(path, sr=sampling_rate)

    if max_seconds is not None and max_seconds > 0:
        # Slice bounds must be integers; int() keeps fractional durations
        # (e.g. 2.5 s) from raising a TypeError.
        speech_array = speech_array[: int(max_seconds * sampling_rate)]

    return {
        "file": path,
        "speech": speech_array,
        "sampling_rate": sampling_rate,
    }


# Lazily filled cache so the checkpoint is loaded at most once per process.
_CTC_COMPONENTS = {}


def _load_ctc_components():
    """Return the cached ``(processor, model)`` pair for ``model_name``."""
    if not _CTC_COMPONENTS:
        # Load both pieces before touching the cache so a failed download
        # never leaves it half populated.
        loaded_processor = Wav2Vec2Processor.from_pretrained(model_name)
        loaded_model = Wav2Vec2ForCTC.from_pretrained(model_name)
        loaded_model.to("cpu")
        _CTC_COMPONENTS["processor"] = loaded_processor
        _CTC_COMPONENTS["model"] = loaded_model
    return _CTC_COMPONENTS["processor"], _CTC_COMPONENTS["model"]


def inference(audio):
    """Transcribe an audio clip and report per-word time ranges.

    Args:
        audio: Either a path string or an object exposing the path through
            a ``name`` attribute (e.g. an uploaded temp-file wrapper).

    Returns:
        A string made of one ``"<start> - <end>: <word>"`` line per
        recognised word (times in seconds, rounded to two decimals),
        followed by a blank line and the lower-cased words joined by
        single spaces.
    """
    # Imported here because the module's import block does not bring torch
    # into scope.
    import torch

    # Accept both plain file paths and file-like wrappers.
    path = getattr(audio, "name", audio)
    sp = speech_file_to_array_fn(path, max_seconds)
    sample_rate = sp["sampling_rate"]

    processor, model = _load_ctc_components()

    # The feature extractor's keyword is `sampling_rate`. Chunk and stride
    # lengths are pipeline options rather than processor arguments, so the
    # (already clipped) waveform is encoded in a single pass.
    input_values = processor(
        sp["speech"],
        sampling_rate=sample_rate,
        return_tensors="pt",
    ).input_values

    with torch.no_grad():
        logits = model(input_values).logits

    pred_ids = torch.argmax(logits, dim=-1).cpu().tolist()
    prediction = processor.tokenizer.decode(
        pred_ids[0], output_word_offsets=True
    )

    # Duration in seconds of one logit frame: offsets returned by the
    # tokenizer are expressed in frames, not in samples.
    time_offset = model.config.inputs_to_logits_ratio / sample_rate

    timed_lines = []
    words = []
    for item in prediction.word_offsets:
        start = round(item["start_offset"] * time_offset, 2)
        end = round(item["end_offset"] * time_offset, 2)
        timed_lines.append(f"{start} - {end}: {item['word']}")
        words.append(item["word"].lower())

    # Log the raw transcription for debugging.
    print(prediction.text)

    return "\n".join(timed_lines) + "\n\n" + " ".join(words)

# Shared speech-recognition pipeline, built once when the module is imported
# and reused by `transcribe` for every request. It runs on CPU and is
# configured with a chunk length of 10 seconds.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    device="cpu",
    chunk_length_s=10,
)

def transcribe(inputs):
    """Run the shared ASR pipeline on ``inputs`` and return the text.

    Args:
        inputs: The audio value delivered by the UI, or ``None`` when the
            user submitted nothing.

    Returns:
        The ``"text"`` entry of the pipeline result.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is not None:
        result = pipe(inputs, batch_size=1, return_timestamps='word')
        return result["text"]

    raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

# Audio widget that hands `transcribe` a file path. Both sources are enabled
# because the description and the error message tell the user to "upload or
# record" audio; a microphone-only widget contradicted that text.
inputs = gr.Audio(
    label="Input Audio",
    sources=["upload", "microphone"],
    type="filepath",
)
outputs = "text"
title = model_name
# NOTE(review): the format and duration limits quoted below are not enforced
# by `transcribe` — confirm the wording is still what you want to promise.
description = f"Gradio demo for a {model_name}. To use it, simply upload your audio, or click one of the examples to load them. Read more at the links below. Currently supports .wav 16_000hz files, max duration of {max_seconds} sec"
# Footer links. The "Pretrained model" link is derived from `model_name`
# instead of carrying a literal placeholder as its href.
article = (
    "<p style='text-align: center'>"
    "<a href='https://github.com/GrosyT/GrosyT.github.io' target='_blank'> Github repo</a>"
    " | "
    f"<a href='https://huggingface.co/{model_name}' target='_blank'>Pretrained model</a>"
    " </p>"
)
# Sample clips offered as one-click examples in the UI.
examples = [
    ["sample1.mp3"],
    ["sample2.mp3"],
]
gr.Interface(
    transcribe,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()