Initial version of the app
app.py
ADDED
@@ -0,0 +1,84 @@
import gradio as gr
import librosa
import torch
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor


model_name = "Grosy/wav2vec2-base-hu"

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
model.to("cpu")

max_seconds = 30


# Read a sound file into a 16 kHz mono array, truncated to max_seconds.
def speech_file_to_array_fn(path, max_seconds=10):
    batch = {"file": path}
    speech_array, sampling_rate = librosa.load(batch["file"], sr=16000)
    if max_seconds > 0:
        speech_array = speech_array[: max_seconds * 16000]
    batch["speech"] = speech_array
    batch["sampling_rate"] = 16000
    return batch


def inference(audio):
    # Read in the uploaded sound file.
    sp = speech_file_to_array_fn(audio.name, max_seconds)

    sample_rate = 16000
    # chunk_length_s / stride_length_s are chunking options of the ASR
    # pipeline API, not of the processor, so the (already truncated) audio
    # is fed to the model in one piece here; see the pipeline sketch below.
    input_values = processor(
        sp["speech"],
        sampling_rate=sample_rate,
        return_tensors="pt",
    ).input_values

    with torch.no_grad():
        logits = model(input_values).logits

    pred_ids = torch.argmax(logits, dim=-1).cpu().tolist()
    prediction = tokenizer.decode(pred_ids[0], output_word_offsets=True)

    # wav2vec2 downsamples its input by a factor of 320, so one logit frame
    # spans 320 / 16000 = 0.02 seconds.
    time_offset = 320 / sample_rate

    total_prediction = []
    words = []
    for r in prediction.word_offsets:
        s = round(r["start_offset"] * time_offset, 2)
        e = round(r["end_offset"] * time_offset, 2)

        total_prediction.append(f"{s} - {e}: {r['word']}")
        words.append(r["word"].lower())

    print(prediction.text)

    return "\n".join(total_prediction) + "\n\n" + " ".join(words)


# Gradio 2.x-style components; type="file" passes a tempfile whose .name
# is read in inference().
inputs = gr.inputs.Audio(label="Input Audio", type="file")
outputs = gr.outputs.Textbox(label="Output Text")
title = model_name
description = f"Gradio demo for {model_name}. To use it, upload your audio or click one of the examples to load it. Read more at the links below. Currently supports 16,000 Hz .wav files with a maximum duration of {max_seconds} seconds."
article = "<p style='text-align: center'><a href='https://github.com/GrosyT/GrosyT.github.io' target='_blank'>Github repo</a> | <a href='<HF Space link>' target='_blank'>Pretrained model</a></p>"
examples = [
    ["sample1.mp3"],
    ["sample2.mp3"],
]
gr.Interface(
    inference,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()
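
Note: the chunk_length_s / stride_length_s comment in inference() describes transformers' automatic-speech-recognition pipeline, which handles long-file chunking itself. A minimal sketch of that alternative approach, assuming the same model and one of the bundled sample files (not what this commit ships):

from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="Grosy/wav2vec2-base-hu",
    device=-1,  # CPU, as in the app above
)

# stride_length_s is a tuple of the left and right stride lengths. With a
# single number, both sides get the same stride; by default the stride on
# each side is 1/6 of chunk_length_s.
result = asr("sample1.mp3", chunk_length_s=10, stride_length_s=(4, 2))
print(result["text"])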