Grosy committed · Commit a485a9b · verified · 1 Parent(s): 130e8c3

Initial version of the app

Files changed (1)
  1. app.py +84 -0
app.py ADDED
@@ -0,0 +1,84 @@
import gradio as gr
import librosa
import torch
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor


model_name = "Grosy/wav2vec2-base-hu"

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
model.to("cpu")

max_seconds = 30


def speech_file_to_array_fn(path, max_seconds=10):
    # Load the audio file, resample to 16 kHz, and truncate to max_seconds.
    batch = {"file": path}
    speech_array, sampling_rate = librosa.load(batch["file"], sr=16000)
    if max_seconds > 0:
        speech_array = speech_array[: max_seconds * 16000]
    batch["speech"] = speech_array
    batch["sampling_rate"] = 16000
    return batch


def inference(audio):
    # Read in the uploaded sound file (truncated to max_seconds).
    sp = speech_file_to_array_fn(audio.name, max_seconds)

    sample_rate = 16000
    # The processor only featurizes the already-truncated waveform.
    # chunk_length_s / stride_length_s are ASR-pipeline arguments, not
    # processor arguments, so no chunked long-form decoding happens here.
    input_values = processor(
        sp["speech"],
        sampling_rate=sample_rate,
        return_tensors="pt",
    ).input_values

    with torch.no_grad():
        logits = model(input_values).logits

    pred_ids = torch.argmax(logits, dim=-1).cpu().tolist()
    prediction = tokenizer.decode(pred_ids[0], output_word_offsets=True)

    # The wav2vec2 feature encoder emits one logit frame per 320 input
    # samples, so each offset step corresponds to 320 / 16000 = 0.02 s.
    time_offset = 320 / sample_rate

    total_prediction = []
    words = []
    for r in prediction.word_offsets:
        s = round(r["start_offset"] * time_offset, 2)
        e = round(r["end_offset"] * time_offset, 2)

        total_prediction.append(f"{s} - {e}: {r['word']}")
        words.append(r["word"].lower())

    print(prediction.text)

    return "\n".join(total_prediction) + "\n\n" + " ".join(words)


inputs = gr.inputs.Audio(label="Input Audio", type="file")
outputs = gr.outputs.Textbox(label="Output Text")
title = model_name
description = (
    f"Gradio demo for {model_name}. Upload your audio, or click one of the "
    f"examples to load it. Read more at the links below. Currently supports "
    f"16 kHz .wav files with a maximum duration of {max_seconds} seconds."
)
article = "<p style='text-align: center'><a href='https://github.com/GrosyT/GrosyT.github.io' target='_blank'>Github repo</a> | <a href='<HF Space link>' target='_blank'>Pretrained model</a></p>"
examples = [
    ["sample1.mp3"],
    ["sample2.mp3"],
]
gr.Interface(
    inference,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()
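
The chunking that the original comments describe (chunk_length_s, stride_length_s) belongs to transformers' automatic-speech-recognition pipeline rather than the processor. A minimal sketch of that long-form decoding path, assuming a transformers version with pipeline chunking and word timestamps for CTC models, and reusing sample1.mp3 from the examples list (the sample files are referenced but not included in this commit):

from transformers import pipeline

# Sketch only: the pipeline resamples, chunks, and re-stitches long audio
# internally; the values below mirror the comments in app.py.
asr = pipeline("automatic-speech-recognition", model="Grosy/wav2vec2-base-hu")

output = asr(
    "sample1.mp3",             # assumed example file, not part of this commit
    chunk_length_s=10,         # each chunk covers 10 s of audio
    stride_length_s=(4, 2),    # 4 s left / 2 s right overlap between chunks
    return_timestamps="word",  # word-level timestamps for CTC models
)

print(output["text"])
for chunk in output["chunks"]:
    start, end = chunk["timestamp"]
    print(f"{start} - {end}: {chunk['text']}")

This would lift the max_seconds truncation, since overlapping strides let the model see context across chunk boundaries instead of cutting the audio off.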