datnth1709 committed
Commit 5efea35 · 1 Parent(s): 4357214

realtime translate

Files changed (2):
  1. app.py +76 -16
  2. requirements.txt +0 -1
app.py CHANGED
@@ -176,11 +176,45 @@ def transcribe_en(audio, state_en="", state_vi=""):
     transcription = eng_tokenizer.decode(predicted_ids[0])
     # Output is all upper case
     transcription = correct_casing(transcription.lower())
-    state_en += transcription + " "
+    state_en += transcription + "+"
     vi_text = translate_en2vi(transcription)
-    state_vi += vi_text + " "
+    state_vi += vi_text + "+"
     return state_en, state_vi
 
+def transcribe_vi_1(audio, state_en=""):
+    ds = speech_file_to_array_fn(audio.name)
+    # infer model
+    input_values = processor(
+        ds["speech"],
+        sampling_rate=ds["sampling_rate"],
+        return_tensors="pt"
+    ).input_values
+    # decode ctc output
+    logits = vi_model(input_values).logits[0]
+    pred_ids = torch.argmax(logits, dim=-1)
+    greedy_search_output = processor.decode(pred_ids)
+    beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
+    en_text = translate_vi2en(beam_search_output)
+    state_en += en_text + " "
+    return state_en, state_en
+
+def transcribe_en_1(audio, state_vi=""):
+    speech = load_data(audio)
+    # Tokenize
+    input_values = eng_tokenizer(speech, return_tensors="pt").input_values
+    # Take logits
+    logits = eng_model(input_values).logits
+    # Take argmax
+    predicted_ids = torch.argmax(logits, dim=-1)
+    # Get the words from predicted word ids
+    transcription = eng_tokenizer.decode(predicted_ids[0])
+    # Output is all upper case
+    transcription = correct_casing(transcription.lower())
+    vi_text = translate_en2vi(transcription)
+    state_vi += vi_text + "+"
+    return state_vi, state_vi
+
+
 """Gradio demo"""
 
 vi_example_text = ["Có phải bạn đang muốn tìm mua nhà ở ngoại ô thành phố Hồ Chí Minh không?",
@@ -221,13 +255,26 @@ with gr.Blocks() as demo:
                 inputs=[vi_audio_1])
 
         with gr.TabItem("Vi-En Realtime Translation"):
-            with gr.Row():
-                with gr.Column():
-                    vi_audio_2 = gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=True)
-                with gr.Column():
-                    speech2text_vi2 = gr.Textbox(label="Vietnamese Text")
-                    english_out_3 = gr.Textbox(label="English Text")
-            vi_audio_2.change(transcribe_vi, [vi_audio_2, speech2text_vi2, english_out_3], [speech2text_vi2, english_out_3])
+            gr.Interface(
+                fn=transcribe_vi_1,
+                inputs=[
+                    gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=True),
+                    "state",
+                ],
+                outputs= [
+                    "text",
+                    "state",
+
+                ],
+                live=True).launch()
+
+            # with gr.Row():
+            #     with gr.Column():
+            #         vi_audio_2 = gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=True)
+            #     with gr.Column():
+            #         speech2text_vi2 = gr.Textbox(label="Vietnamese Text")
+            #         english_out_3 = gr.Textbox(label="English Text")
+            #     vi_audio_2.change(transcribe_vi, [vi_audio_2, speech2text_vi2, english_out_3], [speech2text_vi2, english_out_3])
 
 
     with gr.Tabs():
@@ -255,13 +302,26 @@ with gr.Blocks() as demo:
                 inputs=[en_audio_1])
 
         with gr.TabItem("En-Vi Realtime Translation"):
-            with gr.Row():
-                with gr.Column():
-                    en_audio_2 = gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=True)
-                with gr.Column():
-                    speech2text_en2 = gr.Textbox(label="English Text")
-                    vietnamese_out_3 = gr.Textbox(label="Vietnamese Text")
-            en_audio_2.change(transcribe_en, [en_audio_2, speech2text_en2, vietnamese_out_3], [speech2text_en2, vietnamese_out_3])
+            gr.Interface(
+                fn=transcribe_en_1,
+                inputs=[
+                    gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=True),
+                    "state",
+                ],
+                outputs= [
+                    "text",
+                    "state",
+
+                ],
+                live=True).launch()
+
+            # with gr.Row():
+            #     with gr.Column():
+            #         en_audio_2 = gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=True)
+            #     with gr.Column():
+            #         speech2text_en2 = gr.Textbox(label="English Text")
+            #         vietnamese_out_3 = gr.Textbox(label="Vietnamese Text")
+            #     en_audio_2.change(transcribe_en, [en_audio_2, speech2text_en2, vietnamese_out_3], [speech2text_en2, vietnamese_out_3])
 
 if __name__ == "__main__":
     demo.launch()
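For reference, a minimal sketch of the live gr.Interface + "state" pattern the new realtime tabs switch to. This is illustrative only and not part of the commit: it assumes the older Gradio API used in app.py (gr.Audio still takes source=, and "state"/"text" string shortcuts are valid), and accumulate below is a stand-in for the real ASR + translation pipeline.

    import gradio as gr

    def accumulate(audio_chunk, state=""):
        # Stand-in for the real transcribe + translate step.
        state = state or ""   # the first streamed call may pass None for "state"
        state += "[chunk] "   # append something per microphone chunk
        return state, state   # first value -> "text" output, second -> updated "state"

    gr.Interface(
        fn=accumulate,
        inputs=[
            gr.Audio(source="microphone", streaming=True),  # streamed microphone input
            "state",                                        # carries the running text between calls
        ],
        outputs=["text", "state"],
        live=True,
    ).launch()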
requirements.txt CHANGED
@@ -12,6 +12,5 @@ ffmpeg-python
 gradio
 nltk
 librosa
-transformers
 transformers[sentencepiece]
 https://github.com/kpu/kenlm/archive/master.zip