datnth1709 committed
Commit 5efea35 · 1 Parent(s): 4357214

realtime translate

Files changed (2):
  1. app.py +76 -16
  2. requirements.txt +0 -1
app.py CHANGED
@@ -176,11 +176,45 @@ def transcribe_en(audio, state_en="", state_vi=""):
     transcription = eng_tokenizer.decode(predicted_ids[0])
     # Output is all upper case
     transcription = correct_casing(transcription.lower())
-    state_en += transcription + " "
+    state_en += transcription + "+"
     vi_text = translate_en2vi(transcription)
-    state_vi += vi_text + " "
+    state_vi += vi_text + "+"
     return state_en, state_vi
 
+def transcribe_vi_1(audio, state_en=""):
+    ds = speech_file_to_array_fn(audio.name)
+    # infer model
+    input_values = processor(
+        ds["speech"],
+        sampling_rate=ds["sampling_rate"],
+        return_tensors="pt"
+    ).input_values
+    # decode ctc output
+    logits = vi_model(input_values).logits[0]
+    pred_ids = torch.argmax(logits, dim=-1)
+    greedy_search_output = processor.decode(pred_ids)
+    beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
+    en_text = translate_vi2en(beam_search_output)
+    state_en += en_text + " "
+    return state_en, state_en
+
+def transcribe_en_1(audio, state_vi=""):
+    speech = load_data(audio)
+    # Tokenize
+    input_values = eng_tokenizer(speech, return_tensors="pt").input_values
+    # Take logits
+    logits = eng_model(input_values).logits
+    # Take argmax
+    predicted_ids = torch.argmax(logits, dim=-1)
+    # Get the words from predicted word ids
+    transcription = eng_tokenizer.decode(predicted_ids[0])
+    # Output is all upper case
+    transcription = correct_casing(transcription.lower())
+    vi_text = translate_en2vi(transcription)
+    state_vi += vi_text + "+"
+    return state_vi, state_vi
+
+
 """Gradio demo"""
 
 vi_example_text = ["Có phải bạn đang muốn tìm mua nhà ở ngoại ô thành phố Hồ Chí Minh không?",
@@ -221,13 +255,26 @@ with gr.Blocks() as demo:
                 inputs=[vi_audio_1])
 
         with gr.TabItem("Vi-En Realtime Translation"):
-            with gr.Row():
-                with gr.Column():
-                    vi_audio_2 = gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=True)
-                with gr.Column():
-                    speech2text_vi2 = gr.Textbox(label="Vietnamese Text")
-                    english_out_3 = gr.Textbox(label="English Text")
-            vi_audio_2.change(transcribe_vi, [vi_audio_2, speech2text_vi2, english_out_3], [speech2text_vi2, english_out_3])
+            gr.Interface(
+                fn=transcribe_vi_1,
+                inputs=[
+                    gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=True),
+                    "state",
+                ],
+                outputs= [
+                    "text",
+                    "state",
+
+                ],
+                live=True).launch()
+
+            # with gr.Row():
+            #     with gr.Column():
+            #         vi_audio_2 = gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=True)
+            #     with gr.Column():
+            #         speech2text_vi2 = gr.Textbox(label="Vietnamese Text")
+            #         english_out_3 = gr.Textbox(label="English Text")
+            #     vi_audio_2.change(transcribe_vi, [vi_audio_2, speech2text_vi2, english_out_3], [speech2text_vi2, english_out_3])
 
 
     with gr.Tabs():
@@ -255,13 +302,26 @@ with gr.Blocks() as demo:
                 inputs=[en_audio_1])
 
         with gr.TabItem("En-Vi Realtime Translation"):
-            with gr.Row():
-                with gr.Column():
-                    en_audio_2 = gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=True)
-                with gr.Column():
-                    speech2text_en2 = gr.Textbox(label="English Text")
-                    vietnamese_out_3 = gr.Textbox(label="Vietnamese Text")
-            en_audio_2.change(transcribe_en, [en_audio_2, speech2text_en2, vietnamese_out_3], [speech2text_en2, vietnamese_out_3])
+            gr.Interface(
+                fn=transcribe_en_1,
+                inputs=[
+                    gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=True),
+                    "state",
+                ],
+                outputs= [
+                    "text",
+                    "state",
+
+                ],
+                live=True).launch()
+
+            # with gr.Row():
+            #     with gr.Column():
+            #         en_audio_2 = gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=True)
+            #     with gr.Column():
+            #         speech2text_en2 = gr.Textbox(label="English Text")
+            #         vietnamese_out_3 = gr.Textbox(label="Vietnamese Text")
+            #     en_audio_2.change(transcribe_en, [en_audio_2, speech2text_en2, vietnamese_out_3], [speech2text_en2, vietnamese_out_3])
 
 if __name__ == "__main__":
     demo.launch()
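For reference, a minimal sketch of the live gr.Interface + "state" pattern the new realtime tabs switch to. This is illustrative only and not part of the commit: it assumes the older Gradio API used in app.py (gr.Audio still takes source=, and "state"/"text" string shortcuts are valid), and accumulate below is a stand-in for the real ASR + translation pipeline.

    import gradio as gr

    def accumulate(audio_chunk, state=""):
        # Stand-in for the real transcribe + translate step.
        state = state or ""   # the first streamed call may pass None for "state"
        state += "[chunk] "   # append something per microphone chunk
        return state, state   # first value -> "text" output, second -> updated "state"

    gr.Interface(
        fn=accumulate,
        inputs=[
            gr.Audio(source="microphone", streaming=True),  # streamed microphone input
            "state",                                        # carries the running text between calls
        ],
        outputs=["text", "state"],
        live=True,
    ).launch()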
requirements.txt CHANGED
@@ -12,6 +12,5 @@ ffmpeg-python
 gradio
 nltk
 librosa
-transformers
 transformers[sentencepiece]
 https://github.com/kpu/kenlm/archive/master.zip