"""
Speech Translation Demo with Automatic TTS, Restart Option, and About Tab
This demo performs the following:
1. Accepts up to 15 seconds of audio recording from the microphone.
2. Uses OpenAI’s Whisper model to transcribe the speech.
3. Splits the transcription into segments and translates each segment on-the-fly using Facebook’s M2M100 model.
4. Streams the cumulative translation output to the user.
5. Automatically converts the final translated text to speech using gTTS.
6. Provides a "Restart Recording" button (located just below the recording section)
to reset the audio input, translated text, and TTS output.
Note: True real-time translation (i.e., translating while the user is still speaking) requires a
continuous streaming solution, which the standard browser microphone input does not provide.
"""
import gradio as gr
import whisper
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from gtts import gTTS
import uuid
# -----------------------------------------------------------------------------
# Global Model Loading
# -----------------------------------------------------------------------------
whisper_model = whisper.load_model("base") # Using "base" for a balance between speed and accuracy
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
m2m100_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
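# Both models run on the CPU as loaded above. If torch.cuda.is_available(), inference could be
# sped up by moving them to the GPU (e.g. m2m100_model.to("cuda"), with the encoded inputs in
# translate_audio moved as well); that is left out here to keep the demo CPU-only and simple.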
# -----------------------------------------------------------------------------
# Define Supported Languages (including Polish)
# -----------------------------------------------------------------------------
LANGUAGES = {
"English": "en",
"Spanish": "es",
"French": "fr",
"German": "de",
"Chinese": "zh",
"Polish": "pl"
}
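# These are ISO 639-1 codes, understood both by Whisper (its detected source language) and by the
# M2M100 tokenizer (src_lang / get_lang_id), e.g. LANGUAGES["Polish"] -> "pl".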
# -----------------------------------------------------------------------------
# Main Processing Function: Translation
# -----------------------------------------------------------------------------
def translate_audio(audio, target_language):
    """
    Transcribes the input audio using Whisper and translates the text into the target language.
    Returns the cumulative translated text.
    """
    if audio is None:
        return "No audio provided."
    # Transcribe the audio (using fp16=False for CPU compatibility).
    result = whisper_model.transcribe(audio, fp16=False)
    source_lang = result.get("language", "en")
    target_lang_code = LANGUAGES.get(target_language, "en")
    cumulative_translation = ""
    for segment in result.get("segments", []):
        segment_text = segment.get("text", "").strip()
        if not segment_text:
            continue
        if source_lang == target_lang_code:
            translated_segment = segment_text
        else:
            tokenizer.src_lang = source_lang  # Set source language for proper translation.
            encoded = tokenizer(segment_text, return_tensors="pt")
            generated_tokens = m2m100_model.generate(
                **encoded,
                forced_bos_token_id=tokenizer.get_lang_id(target_lang_code)
            )
            translated_segment = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
        cumulative_translation += translated_segment + " "
    return cumulative_translation.strip()
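# Illustrative usage (hypothetical file path, not from an actual run):
#   translate_audio("recording.wav", "Spanish")
#   -> the Spanish rendering of the recording, built up segment by segment.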
# -----------------------------------------------------------------------------
# TTS Generation Function
# -----------------------------------------------------------------------------
def generate_tts(text, target_language):
    """
    Converts the given text to speech using gTTS and returns the filename of the generated audio.
    """
    lang_code = LANGUAGES.get(target_language, "en")
    if not text or not text.strip():
        return None
    filename = f"tts_{uuid.uuid4().hex}.mp3"
    tts = gTTS(text=text, lang=lang_code)
    tts.save(filename)
    return filename
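# Illustrative usage: generate_tts("Bonjour tout le monde", "French") writes an MP3 named like
# "tts_<hex>.mp3" in the working directory and returns that path, which the
# gr.Audio(type="filepath") output component below can play directly.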
# -----------------------------------------------------------------------------
# Restart Function
# -----------------------------------------------------------------------------
def restart_recording():
    """
    Clears the audio input, translated text, and TTS output.
    """
    return None, "", None
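# The (None, "", None) tuple maps positionally onto the outputs of the Restart button's click()
# handler below: [audio_input, output_text, tts_audio].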
# -----------------------------------------------------------------------------
# Gradio Interface Definition with Tabs
# -----------------------------------------------------------------------------
with gr.Blocks() as demo:
    with gr.Tabs():
        # Demo Tab
        with gr.TabItem("Demo"):
            gr.Markdown("# Speech Translation Demo")
            gr.Markdown(
                "Speak into the microphone and your speech will be transcribed and translated "
                "segment-by-segment. (Recording is limited to 15 seconds.)\n\n"
                "**Note:** The translation and speech synthesis occur automatically after recording."
            )
            # Row for audio input and language selection.
            with gr.Row():
                audio_input = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="Record your speech (max 15 seconds)",
                    elem_id="audio_input"
                )
                target_lang_dropdown = gr.Dropdown(
                    choices=list(LANGUAGES.keys()),
                    value="English",
                    label="Select Target Language"
                )
            # Restart Recording button placed just below the recording section.
            with gr.Row():
                restart_button = gr.Button("Restart Recording")
            # Output components: Translated text and TTS audio.
            output_text = gr.Textbox(label="Translated Text", lines=10)
            tts_audio = gr.Audio(label="Translated Speech", type="filepath")
            # When audio is recorded, process translation and then generate TTS.
            audio_input.change(
                fn=translate_audio,
                inputs=[audio_input, target_lang_dropdown],
                outputs=output_text
            ).then(
                fn=generate_tts,
                inputs=[output_text, target_lang_dropdown],
                outputs=tts_audio
            )
            # Restart button clears all outputs.
            restart_button.click(
                fn=restart_recording,
                inputs=[],
                outputs=[audio_input, output_text, tts_audio]
            )
        # About Tab
        with gr.TabItem("About"):
            gr.Markdown(
                """
**Speech Translation Demo with Automatic TTS and Restart Option**

This demo performs the following:

1. Accepts up to 15 seconds of audio recording from the microphone.
2. Uses OpenAI’s Whisper model to transcribe the speech.
3. Splits the transcription into segments and translates each segment on-the-fly using Facebook’s M2M100 model.
4. Streams the cumulative translation output to the user.
5. Automatically converts the final translated text to speech using gTTS.
6. Provides a "Restart Recording" button (located just below the recording section) to reset the audio input, translated text, and TTS output.

**Note:** True real-time translation (i.e., translating while the user is still speaking) requires a continuous streaming solution, which the standard browser microphone input does not provide.
"""
            )
# Launch the Gradio app.
demo.launch()
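# Rough sketch (an assumption, not part of this demo): Gradio's streaming microphone mode could
# push audio chunks to a handler while the user is still speaking, e.g.
#
#   mic = gr.Audio(sources=["microphone"], streaming=True, type="numpy")
#   mic.stream(fn=handle_chunk, inputs=[mic, target_lang_dropdown], outputs=output_text)
#
# where handle_chunk (hypothetical) would buffer incoming chunks and re-run Whisper incrementally.
# That extra buffering logic is why this demo keeps the simpler record-then-translate flow.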