Update app.py
app.py CHANGED
@@ -4,12 +4,12 @@ Speech Translation Demo with Automatic TTS, Restart Option, and About Tab
 This demo performs the following:
 1. Accepts up to 15 seconds of audio recording from the microphone.
 2. Uses OpenAI’s Whisper model to transcribe the speech.
-3. Splits the transcription into segments and translates each segment
-   on-the-fly using Facebook’s M2M100 model.
+3. Splits the transcription into segments and translates each segment on-the-fly using Facebook’s M2M100 model.
 4. Streams the cumulative translation output to the user.
 5. Automatically converts the final translated text to speech using gTTS.
 6. Provides a "Restart Recording" button (located just below the recording section)
    to reset the audio input, translated text, and TTS output.
+
 Note: True real-time translation (i.e. while speaking) requires a continuous streaming
 solution which is not provided by the standard browser microphone input.
 """
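Step 3 of the docstring relies on the segment list that Whisper returns. For reference, the dict produced by `transcribe()` is shaped roughly like this (illustrative values; non-essential fields such as token ids are omitted):

result = {
    "text": " Hello world.",   # full transcription
    "language": "en",          # detected source language
    "segments": [
        {"id": 0, "start": 0.0, "end": 1.8, "text": " Hello world."},
    ],
}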
@@ -24,10 +24,8 @@ import uuid
 # -----------------------------------------------------------------------------
 # Global Model Loading
 # -----------------------------------------------------------------------------
-
-whisper_model = whisper.load_model("base")  # Adjust model size as needed
+whisper_model = whisper.load_model("base")  # Using "base" for a balance between speed and accuracy
 
-# Load the M2M100 model and tokenizer for translation.
 tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
 m2m100_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
 
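`whisper.load_model` also accepts larger checkpoints and an explicit device. A minimal sketch, assuming a CUDA-capable machine (the "small" checkpoint and the `device` argument are illustrative choices, not part of this file):

import torch
import whisper

# Larger checkpoints trade speed and memory for accuracy.
device = "cuda" if torch.cuda.is_available() else "cpu"
whisper_model = whisper.load_model("small", device=device)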
@@ -44,50 +42,48 @@ LANGUAGES = {
 }
 
 # -----------------------------------------------------------------------------
-# Main Processing Function: Translation
+# Main Processing Function: Translation
 # -----------------------------------------------------------------------------
 def translate_audio(audio, target_language):
     """
-
-
+    Transcribes the input audio using Whisper and translates the text into the target language.
+    Returns the cumulative translated text.
     """
     if audio is None:
-
-
-
-    # Transcribe the audio using Whisper (fp16=False for CPU compatibility)
+        return "No audio provided."
+
+    # Transcribe the audio (using fp16=False for CPU compatibility)
     result = whisper_model.transcribe(audio, fp16=False)
     source_lang = result.get("language", "en")
     target_lang_code = LANGUAGES.get(target_language, "en")
-
+
     cumulative_translation = ""
     for segment in result.get("segments", []):
         segment_text = segment.get("text", "").strip()
         if not segment_text:
             continue
-
+
         if source_lang == target_lang_code:
             translated_segment = segment_text
         else:
-            # Set
-            tokenizer.src_lang = source_lang
+            tokenizer.src_lang = source_lang  # Set source language for proper translation.
             encoded = tokenizer(segment_text, return_tensors="pt")
             generated_tokens = m2m100_model.generate(
                 **encoded,
                 forced_bos_token_id=tokenizer.get_lang_id(target_lang_code)
             )
             translated_segment = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
-
+
         cumulative_translation += translated_segment + " "
-
+
+    return cumulative_translation.strip()
 
 # -----------------------------------------------------------------------------
 # TTS Generation Function
 # -----------------------------------------------------------------------------
 def generate_tts(text, target_language):
     """
-
-    Returns the filename of the generated audio file.
+    Converts the given text to speech using gTTS and returns the filename of the generated audio.
     """
     lang_code = LANGUAGES.get(target_language, "en")
     if not text or not text.strip():
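The `else` branch follows the standard M2M100 recipe: set `src_lang` on the tokenizer, then force the target language's BOS token during generation. In isolation it can be sanity-checked without any audio (the sentence and language pair are illustrative):

from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")

tokenizer.src_lang = "en"  # in the app this comes from Whisper's detected language
encoded = tokenizer("Hello, world!", return_tensors="pt")
tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id("fr"))
print(tokenizer.batch_decode(tokens, skip_special_tokens=True)[0])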
@@ -102,8 +98,7 @@ def generate_tts(text, target_language):
 # -----------------------------------------------------------------------------
 def restart_recording():
     """
-
-    and the TTS audio output.
+    Clears the audio input, translated text, and TTS output.
     """
     return None, "", None
 
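Only the head of `generate_tts` appears in the hunk above, but given the `uuid` import and its docstring, the core of such a function presumably resembles this sketch (the helper name `tts_to_file` and the filename scheme are assumptions, not the file's exact code):

import uuid
from gtts import gTTS

def tts_to_file(text: str, lang_code: str) -> str:
    """Synthesize speech with gTTS and save it under a unique filename."""
    filename = f"tts_{uuid.uuid4().hex}.mp3"
    gTTS(text=text, lang=lang_code).save(filename)
    return filename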
@@ -112,7 +107,7 @@ def restart_recording():
 # -----------------------------------------------------------------------------
 with gr.Blocks() as demo:
     with gr.Tabs():
-        #
+        # Demo Tab
         with gr.TabItem("Demo"):
             gr.Markdown("# Real-time Speech Translation Demo")
             gr.Markdown(
@@ -121,7 +116,7 @@ with gr.Blocks() as demo:
                 "**Note:** The translation and speech synthesis occur automatically after recording."
             )
 
-            # Row for audio input and
+            # Row for audio input and language selection.
             with gr.Row():
                 audio_input = gr.Audio(
                     sources=["microphone"],
@@ -135,7 +130,7 @@ with gr.Blocks() as demo:
                     label="Select Target Language"
                 )
 
-            #
+            # Restart Recording button placed just below the recording section.
             with gr.Row():
                 restart_button = gr.Button("Restart Recording")
 
@@ -143,28 +138,25 @@ with gr.Blocks() as demo:
             output_text = gr.Textbox(label="Translated Text", lines=10)
             tts_audio = gr.Audio(label="Translated Speech", type="filepath")
 
-            #
-            # 1. When new audio is recorded, stream the translation text.
-            # 2. Once translation is complete, automatically generate the TTS audio.
+            # When audio is recorded, process translation and then generate TTS.
             audio_input.change(
                 fn=translate_audio,
                 inputs=[audio_input, target_lang_dropdown],
-                outputs=output_text,
-                stream=True
+                outputs=output_text
             ).then(
                 fn=generate_tts,
                 inputs=[output_text, target_lang_dropdown],
                 outputs=tts_audio
             )
 
-            #
+            # Restart button clears all outputs.
             restart_button.click(
                 fn=restart_recording,
                 inputs=[],
                 outputs=[audio_input, output_text, tts_audio]
             )
 
-        #
+        # About Tab
         with gr.TabItem("About"):
             gr.Markdown(
                 """
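Dropping `stream=True` matches Gradio's API: the `change()` listener takes no such argument. If incremental output is wanted, the idiomatic route is a generator handler plus queuing. A sketch under that assumption (the translation step is elided for brevity):

def translate_audio_streaming(audio, target_language):
    # Each yield pushes the partial text into the Textbox.
    cumulative = ""
    for segment in whisper_model.transcribe(audio, fp16=False).get("segments", []):
        cumulative += segment.get("text", "").strip() + " "
        yield cumulative.strip()

demo.queue()  # generator handlers stream only when the queue is enabled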
@@ -182,6 +174,6 @@ This demo performs the following:
                 """
             )
 
-# Launch the Gradio app
+# Launch the Gradio app.
 demo.launch()
 