Spaces: Runtime error
Commit 5efea35 · Parent(s): 4357214
realtime translate

- app.py +76 -16
- requirements.txt +0 -1
app.py
CHANGED
@@ -176,11 +176,45 @@ def transcribe_en(audio, state_en="", state_vi=""):
     transcription = eng_tokenizer.decode(predicted_ids[0])
     # Output is all upper case
     transcription = correct_casing(transcription.lower())
-    state_en += transcription + "
+    state_en += transcription + "+"
     vi_text = translate_en2vi(transcription)
-    state_vi += vi_text + "
+    state_vi += vi_text + "+"
     return state_en, state_vi

+def transcribe_vi_1(audio, state_en=""):
+    ds = speech_file_to_array_fn(audio.name)
+    # infer model
+    input_values = processor(
+        ds["speech"],
+        sampling_rate=ds["sampling_rate"],
+        return_tensors="pt"
+    ).input_values
+    # decode ctc output
+    logits = vi_model(input_values).logits[0]
+    pred_ids = torch.argmax(logits, dim=-1)
+    greedy_search_output = processor.decode(pred_ids)
+    beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
+    en_text = translate_vi2en(beam_search_output)
+    state_en += en_text + " "
+    return state_en, state_en
+
+def transcribe_en_1(audio, state_vi=""):
+    speech = load_data(audio)
+    # Tokenize
+    input_values = eng_tokenizer(speech, return_tensors="pt").input_values
+    # Take logits
+    logits = eng_model(input_values).logits
+    # Take argmax
+    predicted_ids = torch.argmax(logits, dim=-1)
+    # Get the words from predicted word ids
+    transcription = eng_tokenizer.decode(predicted_ids[0])
+    # Output is all upper case
+    transcription = correct_casing(transcription.lower())
+    vi_text = translate_en2vi(transcription)
+    state_vi += vi_text + "+"
+    return state_vi, state_vi
+
+
 """Gradio demo"""

 vi_example_text = ["Có phải bạn đang muốn tìm mua nhà ở ngoại ô thành phố Hồ Chí Minh không?",
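Both new callbacks follow the same streaming pattern: decode the latest audio chunk, translate it, append the result to a running state string, and return that state twice (once for the visible textbox, once to be carried into the next call). Below is a minimal, self-contained sketch of that pattern; the Wav2Vec2 checkpoint name is a stand-in (app.py loads its own eng_tokenizer/eng_model elsewhere) and translate_en2vi here is a no-op placeholder for the app's real translation helper.

import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Stand-in checkpoint; the Space defines its own ASR models elsewhere in app.py.
asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

def translate_en2vi(text: str) -> str:
    # Placeholder: app.py defines the real English-to-Vietnamese translator.
    return text

def transcribe_chunk(audio_path: str, state_vi: str = ""):
    # Each streamed chunk arrives as a (temporary) audio file; load it at 16 kHz mono.
    speech, _ = librosa.load(audio_path, sr=16_000)
    input_values = asr_processor(speech, sampling_rate=16_000,
                                 return_tensors="pt").input_values
    with torch.no_grad():
        logits = asr_model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = asr_processor.batch_decode(predicted_ids)[0].lower()
    vi_text = translate_en2vi(transcription)
    # Append to the running transcript; returning it twice lets Gradio show it
    # in a textbox and also feed it back in as state on the next call.
    state_vi += vi_text + " "
    return state_vi, state_vi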
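transcribe_vi_1 additionally rescores the CTC logits with an n-gram language model via ngram_lm_model.decode(logits, beam_width=500). The diff does not show how ngram_lm_model is constructed; one common way to obtain a decoder with exactly that call shape is pyctcdecode backed by a KenLM model (requirements.txt already pulls in kenlm). The sketch below is an assumption for illustration only, reusing asr_processor from the sketch above; the pyctcdecode dependency and the .arpa path are hypothetical.

import numpy as np
from pyctcdecode import build_ctcdecoder

# Labels must be ordered by token id so they line up with the logit columns.
vocab_dict = asr_processor.tokenizer.get_vocab()
labels = [tok for tok, _ in sorted(vocab_dict.items(), key=lambda kv: kv[1])]

ngram_lm_model = build_ctcdecoder(
    labels,
    kenlm_model_path="path/to/vi_lm_4gram.arpa",  # hypothetical KenLM file
)

# Usage with CTC logits of shape (time, vocab) as a NumPy array:
# text = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)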
@@ -221,13 +255,26 @@ with gr.Blocks() as demo:
                 inputs=[vi_audio_1])

         with gr.TabItem("Vi-En Realtime Translation"):
-
-
-
-
-
-
-
+            gr.Interface(
+                fn=transcribe_vi_1,
+                inputs=[
+                    gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=True),
+                    "state",
+                ],
+                outputs= [
+                    "text",
+                    "state",
+
+                ],
+                live=True).launch()
+
+            # with gr.Row():
+            #     with gr.Column():
+            #         vi_audio_2 = gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=True)
+            #     with gr.Column():
+            #         speech2text_vi2 = gr.Textbox(label="Vietnamese Text")
+            #         english_out_3 = gr.Textbox(label="English Text")
+            # vi_audio_2.change(transcribe_vi, [vi_audio_2, speech2text_vi2, english_out_3], [speech2text_vi2, english_out_3])


     with gr.Tabs():
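The tab above wires transcribe_vi_1 into a live gr.Interface whose "state" entries are Gradio shorthand for a session State component: the accumulated transcript flows out of one call and back into the next. A minimal standalone version of the same wiring, keeping the older source=/type= keyword style this Space targets (newer Gradio releases renamed source to sources), might look like the sketch below; transcribe_vi_1 is the callback added in this commit.

import gradio as gr

rt_demo = gr.Interface(
    fn=transcribe_vi_1,
    inputs=[
        # type="file" hands the callback a tempfile object, matching the
        # audio.name access inside transcribe_vi_1.
        gr.Audio(source="microphone", type="file", streaming=True,
                 label="Input Vietnamese Audio"),
        "state",        # shorthand for gr.State(): the running transcript
    ],
    outputs=[
        "text",         # shown to the user
        "state",        # fed back into the next call
    ],
    live=True,          # re-run the callback on every streamed chunk
)

if __name__ == "__main__":
    rt_demo.launch()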
@@ -255,13 +302,26 @@ with gr.Blocks() as demo:
                 inputs=[en_audio_1])

         with gr.TabItem("En-Vi Realtime Translation"):
-
-
-
-
-
-
-
+            gr.Interface(
+                fn=transcribe_en_1,
+                inputs=[
+                    gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=True),
+                    "state",
+                ],
+                outputs= [
+                    "text",
+                    "state",
+
+                ],
+                live=True).launch()
+
+            # with gr.Row():
+            #     with gr.Column():
+            #         en_audio_2 = gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=True)
+            #     with gr.Column():
+            #         speech2text_en2 = gr.Textbox(label="English Text")
+            #         vietnamese_out_3 = gr.Textbox(label="Vietnamese Text")
+            # en_audio_2.change(transcribe_en, [en_audio_2, speech2text_en2, vietnamese_out_3], [speech2text_en2, vietnamese_out_3])


 if __name__ == "__main__":
     demo.launch()
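Both realtime tabs call .launch() on a gr.Interface that is nested inside the outer gr.Blocks. A hedged alternative sketch, assuming Gradio 3.x or later where Interface.render() embeds an interface into a surrounding Blocks context, is to build the interface once and render it inside the tab, leaving a single .launch() for the outer demo. This is not what the diff does; it only illustrates the composition pattern.

import gradio as gr

# Built once, outside the Blocks context.
en_vi_rt = gr.Interface(
    fn=transcribe_en_1,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True,
                 label="Input English Audio"),
        "state",
    ],
    outputs=["text", "state"],
    live=True,
)

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("En-Vi Realtime Translation"):
            en_vi_rt.render()    # embed the interface instead of a nested .launch()

if __name__ == "__main__":
    demo.launch()                # a single launch for the whole Blocks app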
requirements.txt
CHANGED
@@ -12,6 +12,5 @@ ffmpeg-python
 gradio
 nltk
 librosa
-transformers
 transformers[sentencepiece]
 https://github.com/kpu/kenlm/archive/master.zip