tdurzynski commited on
Commit
5e70a25
·
verified ·
1 Parent(s): 951b505

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -33
app.py CHANGED
@@ -4,12 +4,12 @@ Speech Translation Demo with Automatic TTS, Restart Option, and About Tab
4
  This demo performs the following:
5
  1. Accepts up to 15 seconds of audio recording from the microphone.
6
  2. Uses OpenAI’s Whisper model to transcribe the speech.
7
- 3. Splits the transcription into segments and translates each segment
8
- on-the-fly using Facebook’s M2M100 model.
9
  4. Streams the cumulative translation output to the user.
10
  5. Automatically converts the final translated text to speech using gTTS.
11
  6. Provides a "Restart Recording" button (located just below the recording section)
12
  to reset the audio input, translated text, and TTS output.
 
13
  Note: True real-time translation (i.e. while speaking) requires a continuous streaming
14
  solution which is not provided by the standard browser microphone input.
15
  """
@@ -24,10 +24,8 @@ import uuid
24
  # -----------------------------------------------------------------------------
25
  # Global Model Loading
26
  # -----------------------------------------------------------------------------
27
- # Load the Whisper model (using "base" for a balance between speed and accuracy).
28
- whisper_model = whisper.load_model("base") # Adjust model size as needed
29
 
30
- # Load the M2M100 model and tokenizer for translation.
31
  tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
32
  m2m100_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
33
 
@@ -44,50 +42,48 @@ LANGUAGES = {
44
  }
45
 
46
  # -----------------------------------------------------------------------------
47
- # Main Processing Function: Translation (streaming)
48
  # -----------------------------------------------------------------------------
49
  def translate_audio(audio, target_language):
50
  """
51
- Process the input audio, transcribe it using Whisper, and translate each segment
52
- to the chosen target language. Yields cumulative translation output for streaming.
53
  """
54
  if audio is None:
55
- yield "No audio provided."
56
- return
57
-
58
- # Transcribe the audio using Whisper (fp16=False for CPU compatibility)
59
  result = whisper_model.transcribe(audio, fp16=False)
60
  source_lang = result.get("language", "en")
61
  target_lang_code = LANGUAGES.get(target_language, "en")
62
-
63
  cumulative_translation = ""
64
  for segment in result.get("segments", []):
65
  segment_text = segment.get("text", "").strip()
66
  if not segment_text:
67
  continue
68
-
69
  if source_lang == target_lang_code:
70
  translated_segment = segment_text
71
  else:
72
- # Set the source language for proper translation.
73
- tokenizer.src_lang = source_lang
74
  encoded = tokenizer(segment_text, return_tensors="pt")
75
  generated_tokens = m2m100_model.generate(
76
  **encoded,
77
  forced_bos_token_id=tokenizer.get_lang_id(target_lang_code)
78
  )
79
  translated_segment = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
80
-
81
  cumulative_translation += translated_segment + " "
82
- yield cumulative_translation.strip()
 
83
 
84
  # -----------------------------------------------------------------------------
85
  # TTS Generation Function
86
  # -----------------------------------------------------------------------------
87
  def generate_tts(text, target_language):
88
  """
89
- Convert the translated text to speech using gTTS.
90
- Returns the filename of the generated audio file.
91
  """
92
  lang_code = LANGUAGES.get(target_language, "en")
93
  if not text or not text.strip():
@@ -102,8 +98,7 @@ def generate_tts(text, target_language):
102
  # -----------------------------------------------------------------------------
103
  def restart_recording():
104
  """
105
- Reset the recording section by clearing the audio input, the translation textbox,
106
- and the TTS audio output.
107
  """
108
  return None, "", None
109
 
@@ -112,7 +107,7 @@ def restart_recording():
112
  # -----------------------------------------------------------------------------
113
  with gr.Blocks() as demo:
114
  with gr.Tabs():
115
- # "Demo" Tab: Contains the interactive interface.
116
  with gr.TabItem("Demo"):
117
  gr.Markdown("# Real-time Speech Translation Demo")
118
  gr.Markdown(
@@ -121,7 +116,7 @@ with gr.Blocks() as demo:
121
  "**Note:** The translation and speech synthesis occur automatically after recording."
122
  )
123
 
124
- # Row for audio input and target language selection.
125
  with gr.Row():
126
  audio_input = gr.Audio(
127
  sources=["microphone"],
@@ -135,7 +130,7 @@ with gr.Blocks() as demo:
135
  label="Select Target Language"
136
  )
137
 
138
- # Row for the Restart Recording button (placed just below the recording section).
139
  with gr.Row():
140
  restart_button = gr.Button("Restart Recording")
141
 
@@ -143,28 +138,25 @@ with gr.Blocks() as demo:
143
  output_text = gr.Textbox(label="Translated Text", lines=10)
144
  tts_audio = gr.Audio(label="Translated Speech", type="filepath")
145
 
146
- # Chain the events:
147
- # 1. When new audio is recorded, stream the translation text.
148
- # 2. Once translation is complete, automatically generate the TTS audio.
149
  audio_input.change(
150
  fn=translate_audio,
151
  inputs=[audio_input, target_lang_dropdown],
152
- outputs=output_text,
153
- stream=True
154
  ).then(
155
  fn=generate_tts,
156
  inputs=[output_text, target_lang_dropdown],
157
  outputs=tts_audio
158
  )
159
 
160
- # The Restart button clears the audio input, translation text, and TTS audio.
161
  restart_button.click(
162
  fn=restart_recording,
163
  inputs=[],
164
  outputs=[audio_input, output_text, tts_audio]
165
  )
166
 
167
- # "About" Tab: Displays the descriptive text.
168
  with gr.TabItem("About"):
169
  gr.Markdown(
170
  """
@@ -182,6 +174,6 @@ This demo performs the following:
182
  """
183
  )
184
 
185
- # Launch the Gradio app (suitable for Hugging Face Spaces).
186
  demo.launch()
187
 
 
4
  This demo performs the following:
5
  1. Accepts up to 15 seconds of audio recording from the microphone.
6
  2. Uses OpenAI’s Whisper model to transcribe the speech.
7
+ 3. Splits the transcription into segments and translates each segment on-the-fly using Facebook’s M2M100 model.
 
8
  4. Streams the cumulative translation output to the user.
9
  5. Automatically converts the final translated text to speech using gTTS.
10
  6. Provides a "Restart Recording" button (located just below the recording section)
11
  to reset the audio input, translated text, and TTS output.
12
+
13
  Note: True real-time translation (i.e. while speaking) requires a continuous streaming
14
  solution which is not provided by the standard browser microphone input.
15
  """
 
24
  # -----------------------------------------------------------------------------
25
  # Global Model Loading
26
  # -----------------------------------------------------------------------------
27
# Load the Whisper speech-recognition model once at import time so every
# request reuses it.
whisper_model = whisper.load_model("base")  # Using "base" for a balance between speed and accuracy

# Load the M2M100 tokenizer/model pair used for segment-by-segment text
# translation (418M-parameter multilingual checkpoint).
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
m2m100_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
31
 
 
42
  }
43
 
44
  # -----------------------------------------------------------------------------
45
+ # Main Processing Function: Translation
46
  # -----------------------------------------------------------------------------
47
def translate_audio(audio, target_language):
    """
    Transcribe the recorded audio with Whisper and translate it into the
    selected target language with M2M100.

    Args:
        audio: Filepath of the recorded clip, or None when nothing has been
            recorded yet (the Gradio Audio component passes None on clear).
        target_language: Human-readable language name; looked up in the
            module-level LANGUAGES mapping, falling back to "en".

    Returns:
        The cumulative translated text, or a short notice when no audio was
        provided.
    """
    if audio is None:
        return "No audio provided."

    # fp16=False keeps Whisper on a CPU-compatible code path.
    result = whisper_model.transcribe(audio, fp16=False)
    source_lang = result.get("language", "en")
    target_lang_code = LANGUAGES.get(target_language, "en")

    # Hoist loop invariants: the tokenizer source language and the forced
    # BOS token are identical for every segment, so set them once instead
    # of on each iteration (the original reassigned them per segment).
    needs_translation = source_lang != target_lang_code
    if needs_translation:
        tokenizer.src_lang = source_lang
        target_token_id = tokenizer.get_lang_id(target_lang_code)

    translated_segments = []
    for segment in result.get("segments", []):
        segment_text = segment.get("text", "").strip()
        if not segment_text:
            continue

        if not needs_translation:
            # Already in the target language; pass the segment through.
            translated_segments.append(segment_text)
            continue

        encoded = tokenizer(segment_text, return_tensors="pt")
        generated_tokens = m2m100_model.generate(
            **encoded,
            forced_bos_token_id=target_token_id
        )
        translated_segments.append(
            tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
        )

    # Single-space join is equivalent to the previous concatenate-then-strip
    # loop but avoids quadratic string building.
    return " ".join(translated_segments)
80
 
81
  # -----------------------------------------------------------------------------
82
  # TTS Generation Function
83
  # -----------------------------------------------------------------------------
84
  def generate_tts(text, target_language):
85
  """
86
+ Converts the given text to speech using gTTS and returns the filename of the generated audio.
 
87
  """
88
  lang_code = LANGUAGES.get(target_language, "en")
89
  if not text or not text.strip():
 
98
  # -----------------------------------------------------------------------------
99
def restart_recording():
    """
    Reset the demo's interactive widgets.

    Returns a 3-tuple of cleared values: None for the audio input, an
    empty string for the translated-text box, and None for the TTS audio
    output.
    """
    cleared_audio, cleared_text, cleared_tts = None, "", None
    return cleared_audio, cleared_text, cleared_tts
104
 
 
107
  # -----------------------------------------------------------------------------
108
  with gr.Blocks() as demo:
109
  with gr.Tabs():
110
+ # Demo Tab
111
  with gr.TabItem("Demo"):
112
  gr.Markdown("# Real-time Speech Translation Demo")
113
  gr.Markdown(
 
116
  "**Note:** The translation and speech synthesis occur automatically after recording."
117
  )
118
 
119
+ # Row for audio input and language selection.
120
  with gr.Row():
121
  audio_input = gr.Audio(
122
  sources=["microphone"],
 
130
  label="Select Target Language"
131
  )
132
 
133
+ # Restart Recording button placed just below the recording section.
134
  with gr.Row():
135
  restart_button = gr.Button("Restart Recording")
136
 
 
138
  output_text = gr.Textbox(label="Translated Text", lines=10)
139
  tts_audio = gr.Audio(label="Translated Speech", type="filepath")
140
 
141
+ # When audio is recorded, process translation and then generate TTS.
 
 
142
  audio_input.change(
143
  fn=translate_audio,
144
  inputs=[audio_input, target_lang_dropdown],
145
+ outputs=output_text
 
146
  ).then(
147
  fn=generate_tts,
148
  inputs=[output_text, target_lang_dropdown],
149
  outputs=tts_audio
150
  )
151
 
152
+ # Restart button clears all outputs.
153
  restart_button.click(
154
  fn=restart_recording,
155
  inputs=[],
156
  outputs=[audio_input, output_text, tts_audio]
157
  )
158
 
159
+ # About Tab
160
  with gr.TabItem("About"):
161
  gr.Markdown(
162
  """
 
174
  """
175
  )
176
 
177
+ # Launch the Gradio app.
178
  demo.launch()
179