Initial version of the app
app.py
ADDED
@@ -0,0 +1,84 @@
import gradio as gr
import librosa
import torch
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor


model_name = "Grosy/wav2vec2-base-hu"

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
model.to("cpu")

max_seconds = 30


# Read a sound file into a 16 kHz mono array, truncated to max_seconds.
def speech_file_to_array_fn(path, max_seconds=10):
    batch = {"file": path}
    speech_array, sampling_rate = librosa.load(batch["file"], sr=16000)
    if max_seconds > 0:
        speech_array = speech_array[: max_seconds * 16000]
    batch["speech"] = speech_array
    batch["sampling_rate"] = 16000
    return batch


def inference(audio):
    # Read in the uploaded sound file.
    sp = speech_file_to_array_fn(audio.name, max_seconds)

    sample_rate = 16000
    # chunk_length_s / stride_length_s are chunking options of the ASR
    # pipeline API, not of the processor, so the (already truncated) audio
    # is fed to the model in one piece here; see the pipeline sketch below.
    input_values = processor(
        sp["speech"],
        sampling_rate=sample_rate,
        return_tensors="pt",
    ).input_values

    with torch.no_grad():
        logits = model(input_values).logits

    pred_ids = torch.argmax(logits, dim=-1).cpu().tolist()
    prediction = tokenizer.decode(pred_ids[0], output_word_offsets=True)

    # wav2vec2 downsamples its input by a factor of 320, so one logit frame
    # spans 320 / 16000 = 0.02 seconds.
    time_offset = 320 / sample_rate

    total_prediction = []
    words = []
    for r in prediction.word_offsets:
        s = round(r["start_offset"] * time_offset, 2)
        e = round(r["end_offset"] * time_offset, 2)

        total_prediction.append(f"{s} - {e}: {r['word']}")
        words.append(r["word"].lower())

    print(prediction.text)

    return "\n".join(total_prediction) + "\n\n" + " ".join(words)


# Gradio 2.x-style components; type="file" passes a tempfile whose .name
# is read in inference().
inputs = gr.inputs.Audio(label="Input Audio", type="file")
outputs = gr.outputs.Textbox(label="Output Text")
title = model_name
description = f"Gradio demo for {model_name}. To use it, upload your audio or click one of the examples to load it. Read more at the links below. Currently supports 16,000 Hz .wav files with a maximum duration of {max_seconds} seconds."
article = "<p style='text-align: center'><a href='https://github.com/GrosyT/GrosyT.github.io' target='_blank'>Github repo</a> | <a href='<HF Space link>' target='_blank'>Pretrained model</a></p>"
examples = [
    ["sample1.mp3"],
    ["sample2.mp3"],
]
gr.Interface(
    inference,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()
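
Note: the chunk_length_s / stride_length_s comment in inference() describes transformers' automatic-speech-recognition pipeline, which handles long-file chunking itself. A minimal sketch of that alternative approach, assuming the same model and one of the bundled sample files (not what this commit ships):

from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="Grosy/wav2vec2-base-hu",
    device=-1,  # CPU, as in the app above
)

# stride_length_s is a tuple of the left and right stride lengths. With a
# single number, both sides get the same stride; by default the stride on
# each side is 1/6 of chunk_length_s.
result = asr("sample1.mp3", chunk_length_s=10, stride_length_s=(4, 2))
print(result["text"])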