Spaces:

DroolingPanda
/

teachingAssistant

Running

App Files Files Community

Michael Hu committed 2 days ago

Commit

f0248ed

1 Parent(s): 4a9bb1a

move to Gradio so we can leverage ZeroGPU

Browse files

Files changed (4) hide show

app.py +172 -128
app_gradio.py +237 -0
requirements.txt +5 -2
utils/tts_dia.py +2 -1

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
-"""
-Main entry point for the Audio Translation Web Application
 Handles file upload, processing pipeline, and UI rendering
 """
@@ -14,180 +13,225 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
-import streamlit as st
 import os
 import time
-import subprocess
 from utils.stt import transcribe_audio
 from utils.translation import translate_text
-from utils.tts import get_tts_engine, generate_speech
 # Initialize environment configurations
 os.makedirs("temp/uploads", exist_ok=True)
 os.makedirs("temp/outputs", exist_ok=True)
-def configure_page():
-    """Set up Streamlit page configuration"""
-    logger.info("Configuring Streamlit page")
-    st.set_page_config(
-        page_title="Audio Translator",
-        page_icon="🎧",
-        layout="wide",
-        initial_sidebar_state="expanded"
-    )
-    st.markdown("""
-        <style>
-            .reportview-container {margin-top: -2em;}
-            #MainMenu {visibility: hidden;}
-            .stDeployButton {display:none;}
-            .stAlert {padding: 20px !important;}
-        </style>
-    """, unsafe_allow_html=True)
-def handle_file_processing(upload_path):
     """
     Execute the complete processing pipeline:
     1. Speech-to-Text (STT)
     2. Machine Translation
     3. Text-to-Speech (TTS)
     """
-    logger.info(f"Starting processing for: {upload_path}")
-    progress_bar = st.progress(0)
-    status_text = st.empty()
     try:
         # STT Phase
         logger.info("Beginning STT processing")
-        status_text.markdown("🔍 **Performing Speech Recognition...**")
-        with st.spinner("Initializing Whisper model..."):
-            english_text = transcribe_audio(upload_path)
-        progress_bar.progress(30)
         logger.info(f"STT completed. Text length: {len(english_text)} characters")
         # Translation Phase
         logger.info("Beginning translation")
-        status_text.markdown("🌐 **Translating Content...**")
-        with st.spinner("Loading translation model..."):
-            chinese_text = translate_text(english_text)
-        progress_bar.progress(60)
         logger.info(f"Translation completed. Translated length: {len(chinese_text)} characters")
         # TTS Phase
         logger.info("Beginning TTS generation")
-        status_text.markdown("🎵 **Generating Chinese Speech...**")
         # Initialize TTS engine with appropriate language code for Chinese
         engine = get_tts_engine(lang_code='z')  # 'z' for Mandarin Chinese
         # Generate speech and get the file path
         output_path = engine.generate_speech(chinese_text, voice="zf_xiaobei")
-        progress_bar.progress(100)
         logger.info(f"TTS completed. Output file: {output_path}")
-        # Store the text for streaming playback
-        st.session_state.current_text = chinese_text
-        status_text.success("✅ Processing Complete!")
-        return english_text, chinese_text, output_path
     except Exception as e:
         logger.error(f"Processing failed: {str(e)}", exc_info=True)
-        status_text.error(f"❌ Processing Failed: {str(e)}")
-        st.exception(e)
-        raise
-def render_results(english_text, chinese_text, output_path):
-    """Display processing results in organized columns"""
-    logger.info("Rendering results")
-    st.divider()
-    col1, col2 = st.columns([2, 1])
-    with col1:
-        st.subheader("Recognition Results")
-        st.code(english_text, language="text")
-        st.subheader("Translation Results")
-        st.code(chinese_text, language="text")
-    with col2:
-        st.subheader("Audio Output")
-        # Standard audio player for the full file
-        st.audio(output_path)
-        # Download button
-        with open(output_path, "rb") as f:
-            st.download_button(
-                label="Download Audio",
-                data=f,
-                file_name="translated_audio.wav",
-                mime="audio/wav"
-            )
-        # Streaming playback controls
-        st.subheader("Streaming Playback")
-        if st.button("Stream Audio"):
-            engine = get_tts_engine(lang_code='z')
-            streaming_placeholder = st.empty()
-            # Stream the audio in chunks
-            for sample_rate, audio_chunk in engine.generate_speech_stream(
-                chinese_text,
-                voice="zf_xiaobei"
-            ):
-                # Create a temporary file for each chunk
-                temp_chunk_path = f"temp/outputs/chunk_{time.time()}.wav"
-                import soundfile as sf
-                sf.write(temp_chunk_path, audio_chunk, sample_rate)
-                # Play the chunk
-                with streaming_placeholder:
-                    st.audio(temp_chunk_path, sample_rate=sample_rate)
-                # Clean up the temporary chunk file
-                os.remove(temp_chunk_path)
-def initialize_session_state():
-    """Initialize session state variables"""
-    if 'current_text' not in st.session_state:
-        st.session_state.current_text = None
-def main():
-    """Main application workflow"""
-    logger.info("Starting application")
-    configure_page()
-    initialize_session_state()
-    st.title("🎧 High-Quality Audio Translation System")
-    st.markdown("Upload English Audio → Get Chinese Speech Output")
-    # Voice selection in sidebar
-    st.sidebar.header("TTS Settings")
-    voice_options = {
-        "Xiaobei (Female)": "zf_xiaobei",
-        "Yunjian (Male)": "zm_yunjian",
-    }
-    selected_voice = st.sidebar.selectbox(
-        "Select Voice",
-        list(voice_options.keys()),
-        format_func=lambda x: x
-    )
-    speed = st.sidebar.slider("Speech Speed", 0.5, 2.0, 1.0, 0.1)
-    uploaded_file = st.file_uploader(
-        "Select Audio File (MP3/WAV)",
-        type=["mp3", "wav"],
-        accept_multiple_files=False
-    )
-    if uploaded_file:
-        logger.info(f"File uploaded: {uploaded_file.name}")
-        upload_path = os.path.join("temp/uploads", uploaded_file.name)
-        with open(upload_path, "wb") as f:
-            f.write(uploaded_file.getbuffer())
-        results = handle_file_processing(upload_path)
-        if results:
-            render_results(*results)
 if __name__ == "__main__":
     main()

+"""Main entry point for the Audio Translation Web Application using Gradio
 Handles file upload, processing pipeline, and UI rendering
 """
 )
 logger = logging.getLogger(__name__)
+import gradio as gr
 import os
 import time
+import numpy as np
+import soundfile as sf
 from utils.stt import transcribe_audio
 from utils.translation import translate_text
+from utils.tts import get_tts_engine
 # Initialize environment configurations
 os.makedirs("temp/uploads", exist_ok=True)
 os.makedirs("temp/outputs", exist_ok=True)
+# CSS for styling the Gradio interface: caps and centers the page container,
+# and styles any element tagged with the "output-text" class
+css = """
+.gradio-container {
+    max-width: 1200px;
+    margin: 0 auto;
+}
+.output-text {
+    font-family: monospace;
+    padding: 10px;
+    background-color: #f5f5f5;
+    border-radius: 4px;
+}
+"""
+def handle_file_processing(audio_file):
     """
     Execute the complete processing pipeline:
     1. Speech-to-Text (STT)
     2. Machine Translation
     3. Text-to-Speech (TTS)
+    Args:
+        audio_file: Tuple containing (sample_rate, audio_data), or None when
+            no audio was supplied
+    Returns:
+        Tuple containing (english_text, chinese_text, output_audio), where
+        output_audio is a (sample_rate, audio_data) tuple
+    Raises:
+        gr.Error: If no audio was supplied or any pipeline stage fails
     """
+    logger.info("Starting processing for uploaded audio")
+    # Reject a missing input up front so the user sees an actionable message
+    # instead of a tuple-unpacking error wrapped in "Processing Failed"
+    if audio_file is None:
+        raise gr.Error("Please upload or record audio before processing.")
+    temp_path = None
     try:
+        # Save the uploaded audio to a temporary file
+        sr, audio_data = audio_file
+        temp_path = os.path.join("temp/uploads", f"upload_{time.time()}.wav")
+        sf.write(temp_path, audio_data, sr)
+        logger.info(f"Saved uploaded audio to {temp_path}")
         # STT Phase
         logger.info("Beginning STT processing")
+        english_text = transcribe_audio(temp_path)
         logger.info(f"STT completed. Text length: {len(english_text)} characters")
         # Translation Phase
         logger.info("Beginning translation")
+        chinese_text = translate_text(english_text)
         logger.info(f"Translation completed. Translated length: {len(chinese_text)} characters")
         # TTS Phase
         logger.info("Beginning TTS generation")
         # Initialize TTS engine with appropriate language code for Chinese
         engine = get_tts_engine(lang_code='z')  # 'z' for Mandarin Chinese
         # Generate speech and get the file path
         output_path = engine.generate_speech(chinese_text, voice="zf_xiaobei")
         logger.info(f"TTS completed. Output file: {output_path}")
+        # Load the generated audio for Gradio output
+        output_audio, output_sr = sf.read(output_path)
+        return english_text, chinese_text, (output_sr, output_audio)
     except Exception as e:
         logger.error(f"Processing failed: {str(e)}", exc_info=True)
+        # Chain the original exception so the traceback keeps the root cause
+        raise gr.Error(f"Processing Failed: {str(e)}") from e
+    finally:
+        # temp_path is only passed to transcribe_audio; delete the copy so
+        # temp/uploads does not grow with every request
+        if temp_path is not None and os.path.exists(temp_path):
+            try:
+                os.remove(temp_path)
+            except OSError:
+                logger.warning(f"Could not remove temporary file {temp_path}")
+def stream_audio(chinese_text, voice, speed):
+    """
+    Stream audio in chunks for the Gradio interface
+    Args:
+        chinese_text: The Chinese text to convert to speech
+        voice: The voice to use
+        speed: The speech speed factor
+    Yields:
+        Tuple containing (sample_rate, audio_chunk) for each generated chunk
+    Raises:
+        gr.Error: If there is no text to synthesize
+    """
+    # Streaming before any translation exists would hand the engine an empty
+    # string; tell the user what to do instead
+    if not chinese_text or not chinese_text.strip():
+        raise gr.Error("No translated text to stream. Process an audio file first.")
+    engine = get_tts_engine(lang_code='z')
+    # Stream the audio in chunks
+    for sample_rate, audio_chunk in engine.generate_speech_stream(
+        chinese_text,
+        voice=voice,
+        speed=speed
+    ):
+        # Yield the chunk directly as a (sample_rate, array) tuple; no
+        # temporary WAV file is needed, which avoids per-chunk disk I/O and
+        # leftover files when a read fails.
+        # np.asarray presumably matches the array conversion soundfile did
+        # when writing the chunk — TODO confirm the engine's chunk type
+        yield (sample_rate, np.asarray(audio_chunk))
+def create_interface():
+    """
+    Create and configure the Gradio interface
+    Returns:
+        Gradio Blocks interface
+    """
+    # Display name -> TTS voice ID; also drives the dropdown choices so the
+    # two cannot drift apart
+    voice_map = {
+        "Xiaobei (Female)": "zf_xiaobei",
+        "Yunjian (Male)": "zm_yunjian"
+    }
+    default_voice = "Xiaobei (Female)"
+    # Only list example clips that are present on disk (missing files are
+    # presumably rejected by gr.Examples — TODO confirm)
+    example_files = [
+        [path]
+        for path in ("examples/sample1.mp3", "examples/sample2.wav")
+        if os.path.exists(path)
+    ]
+    with gr.Blocks(css=css) as interface:
+        gr.Markdown("# 🎧 High-Quality Audio Translation System")
+        gr.Markdown("Upload English Audio → Get Chinese Speech Output")
+        with gr.Row():
+            with gr.Column(scale=2):
+                # File upload component
+                audio_input = gr.Audio(
+                    label="Upload English Audio",
+                    type="numpy",
+                    sources=["upload", "microphone"]
+                )
+                # Process button
+                process_btn = gr.Button("Process Audio", variant="primary")
+            with gr.Column(scale=1):
+                # TTS Settings (currently only passed to the Stream Audio
+                # handler; the Process Audio handler receives the audio only).
+                # gr.Box no longer exists in current Gradio releases; gr.Group
+                # is the equivalent container
+                with gr.Group():
+                    gr.Markdown("### TTS Settings")
+                    voice_dropdown = gr.Dropdown(
+                        choices=list(voice_map),
+                        value=default_voice,
+                        label="Select Voice"
+                    )
+                    speed_slider = gr.Slider(
+                        minimum=0.5,
+                        maximum=2.0,
+                        value=1.0,
+                        step=0.1,
+                        label="Speech Speed"
+                    )
+        # Output section
+        with gr.Row():
+            with gr.Column(scale=2):
+                # Text outputs
+                english_output = gr.Textbox(
+                    label="Recognition Results",
+                    lines=5,
+                    elem_classes=["output-text"]
+                )
+                chinese_output = gr.Textbox(
+                    label="Translation Results",
+                    lines=5,
+                    elem_classes=["output-text"]
+                )
+            with gr.Column(scale=1):
+                # Audio output
+                # NOTE(review): this component is not created with
+                # streaming=True, so each streamed chunk presumably replaces
+                # the previous one instead of being appended — confirm
+                audio_output = gr.Audio(
+                    label="Audio Output",
+                    type="numpy"
+                )
+                # Stream button
+                stream_btn = gr.Button("Stream Audio")
+                # Download button is automatically provided by gr.Audio
+        # Set up event handlers
+        process_btn.click(
+            fn=handle_file_processing,
+            inputs=[audio_input],
+            outputs=[english_output, chinese_output, audio_output]
+        )
+        # Map voice selection to actual voice IDs
+        def get_voice_id(voice_name):
+            return voice_map.get(voice_name, voice_map[default_voice])
+        # Stream button handler. It must itself be a generator function: a
+        # lambda that merely returns a generator is not recognised by Gradio
+        # as a streaming handler
+        def stream_selected_voice(text, voice, speed):
+            yield from stream_audio(text, get_voice_id(voice), speed)
+        stream_btn.click(
+            fn=stream_selected_voice,
+            inputs=[chinese_output, voice_dropdown, speed_slider],
+            outputs=audio_output
+        )
+        # Examples
+        if example_files:
+            gr.Examples(
+                examples=example_files,
+                inputs=audio_input
+            )
+    return interface
+def main():
+    """
+    Log startup, then build the Gradio interface and launch it
+    """
+    logger.info("Starting Gradio application")
+    create_interface().launch()
 if __name__ == "__main__":
     main()

app_gradio.py ADDED Viewed

	@@ -0,0 +1,237 @@

+"""Main entry point for the Audio Translation Web Application using Gradio
+Handles file upload, processing pipeline, and UI rendering
+"""
+import logging
+# Send INFO-and-above records, timestamped and tagged with logger name and
+# level, to both the file "app.log" and a stream handler
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler("app.log"),
+        logging.StreamHandler()
+    ]
+)
+# Module-level logger used by every function in this file
+logger = logging.getLogger(__name__)
+import gradio as gr
+import os
+import time
+import numpy as np
+import soundfile as sf
+from utils.stt import transcribe_audio
+from utils.translation import translate_text
+from utils.tts import get_tts_engine, generate_speech
+# Initialize environment configurations: create the temporary upload and
+# output directories at import time (no-op when they already exist)
+os.makedirs("temp/uploads", exist_ok=True)
+os.makedirs("temp/outputs", exist_ok=True)
+# CSS for styling the Gradio interface: caps and centers the page container,
+# and styles any element tagged with the "output-text" class
+css = """
+.gradio-container {
+    max-width: 1200px;
+    margin: 0 auto;
+}
+.output-text {
+    font-family: monospace;
+    padding: 10px;
+    background-color: #f5f5f5;
+    border-radius: 4px;
+}
+"""
+def handle_file_processing(audio_file):
+    """
+    Execute the complete processing pipeline:
+    1. Speech-to-Text (STT)
+    2. Machine Translation
+    3. Text-to-Speech (TTS)
+    Args:
+        audio_file: Tuple containing (sample_rate, audio_data), or None when
+            no audio was supplied
+    Returns:
+        Tuple containing (english_text, chinese_text, output_audio), where
+        output_audio is a (sample_rate, audio_data) tuple
+    Raises:
+        gr.Error: If no audio was supplied or any pipeline stage fails
+    """
+    logger.info("Starting processing for uploaded audio")
+    # Reject a missing input up front so the user sees an actionable message
+    # instead of a tuple-unpacking error wrapped in "Processing Failed"
+    if audio_file is None:
+        raise gr.Error("Please upload or record audio before processing.")
+    temp_path = None
+    try:
+        # Save the uploaded audio to a temporary file
+        sr, audio_data = audio_file
+        temp_path = os.path.join("temp/uploads", f"upload_{time.time()}.wav")
+        sf.write(temp_path, audio_data, sr)
+        logger.info(f"Saved uploaded audio to {temp_path}")
+        # STT Phase
+        logger.info("Beginning STT processing")
+        english_text = transcribe_audio(temp_path)
+        logger.info(f"STT completed. Text length: {len(english_text)} characters")
+        # Translation Phase
+        logger.info("Beginning translation")
+        chinese_text = translate_text(english_text)
+        logger.info(f"Translation completed. Translated length: {len(chinese_text)} characters")
+        # TTS Phase
+        logger.info("Beginning TTS generation")
+        # Initialize TTS engine with appropriate language code for Chinese
+        engine = get_tts_engine(lang_code='z')  # 'z' for Mandarin Chinese
+        # Generate speech and get the file path
+        output_path = engine.generate_speech(chinese_text, voice="zf_xiaobei")
+        logger.info(f"TTS completed. Output file: {output_path}")
+        # Load the generated audio for Gradio output
+        output_audio, output_sr = sf.read(output_path)
+        return english_text, chinese_text, (output_sr, output_audio)
+    except Exception as e:
+        logger.error(f"Processing failed: {str(e)}", exc_info=True)
+        # Chain the original exception so the traceback keeps the root cause
+        raise gr.Error(f"Processing Failed: {str(e)}") from e
+    finally:
+        # temp_path is only passed to transcribe_audio; delete the copy so
+        # temp/uploads does not grow with every request
+        if temp_path is not None and os.path.exists(temp_path):
+            try:
+                os.remove(temp_path)
+            except OSError:
+                logger.warning(f"Could not remove temporary file {temp_path}")
+def stream_audio(chinese_text, voice, speed):
+    """
+    Stream audio in chunks for the Gradio interface
+    Args:
+        chinese_text: The Chinese text to convert to speech
+        voice: The voice to use
+        speed: The speech speed factor
+    Yields:
+        Tuple containing (sample_rate, audio_chunk) for each generated chunk
+    Raises:
+        gr.Error: If there is no text to synthesize
+    """
+    # Streaming before any translation exists would hand the engine an empty
+    # string; tell the user what to do instead
+    if not chinese_text or not chinese_text.strip():
+        raise gr.Error("No translated text to stream. Process an audio file first.")
+    engine = get_tts_engine(lang_code='z')
+    # Stream the audio in chunks
+    for sample_rate, audio_chunk in engine.generate_speech_stream(
+        chinese_text,
+        voice=voice,
+        speed=speed
+    ):
+        # Yield the chunk directly as a (sample_rate, array) tuple; no
+        # temporary WAV file is needed, which avoids per-chunk disk I/O and
+        # leftover files when a read fails.
+        # np.asarray presumably matches the array conversion soundfile did
+        # when writing the chunk — TODO confirm the engine's chunk type
+        yield (sample_rate, np.asarray(audio_chunk))
+def create_interface():
+    """
+    Create and configure the Gradio interface
+    Returns:
+        Gradio Blocks interface
+    """
+    # Display name -> TTS voice ID; also drives the dropdown choices so the
+    # two cannot drift apart
+    voice_map = {
+        "Xiaobei (Female)": "zf_xiaobei",
+        "Yunjian (Male)": "zm_yunjian"
+    }
+    default_voice = "Xiaobei (Female)"
+    # Only list example clips that are present on disk (missing files are
+    # presumably rejected by gr.Examples — TODO confirm)
+    example_files = [
+        [path]
+        for path in ("examples/sample1.mp3", "examples/sample2.wav")
+        if os.path.exists(path)
+    ]
+    with gr.Blocks(css=css) as interface:
+        gr.Markdown("# 🎧 High-Quality Audio Translation System")
+        gr.Markdown("Upload English Audio → Get Chinese Speech Output")
+        with gr.Row():
+            with gr.Column(scale=2):
+                # File upload component
+                audio_input = gr.Audio(
+                    label="Upload English Audio",
+                    type="numpy",
+                    sources=["upload", "microphone"]
+                )
+                # Process button
+                process_btn = gr.Button("Process Audio", variant="primary")
+            with gr.Column(scale=1):
+                # TTS Settings (currently only passed to the Stream Audio
+                # handler; the Process Audio handler receives the audio only).
+                # gr.Box no longer exists in current Gradio releases; gr.Group
+                # is the equivalent container
+                with gr.Group():
+                    gr.Markdown("### TTS Settings")
+                    voice_dropdown = gr.Dropdown(
+                        choices=list(voice_map),
+                        value=default_voice,
+                        label="Select Voice"
+                    )
+                    speed_slider = gr.Slider(
+                        minimum=0.5,
+                        maximum=2.0,
+                        value=1.0,
+                        step=0.1,
+                        label="Speech Speed"
+                    )
+        # Output section
+        with gr.Row():
+            with gr.Column(scale=2):
+                # Text outputs
+                english_output = gr.Textbox(
+                    label="Recognition Results",
+                    lines=5,
+                    elem_classes=["output-text"]
+                )
+                chinese_output = gr.Textbox(
+                    label="Translation Results",
+                    lines=5,
+                    elem_classes=["output-text"]
+                )
+            with gr.Column(scale=1):
+                # Audio output
+                # NOTE(review): this component is not created with
+                # streaming=True, so each streamed chunk presumably replaces
+                # the previous one instead of being appended — confirm
+                audio_output = gr.Audio(
+                    label="Audio Output",
+                    type="numpy"
+                )
+                # Stream button
+                stream_btn = gr.Button("Stream Audio")
+                # Download button is automatically provided by gr.Audio
+        # Set up event handlers
+        process_btn.click(
+            fn=handle_file_processing,
+            inputs=[audio_input],
+            outputs=[english_output, chinese_output, audio_output]
+        )
+        # Map voice selection to actual voice IDs
+        def get_voice_id(voice_name):
+            return voice_map.get(voice_name, voice_map[default_voice])
+        # Stream button handler. It must itself be a generator function: a
+        # lambda that merely returns a generator is not recognised by Gradio
+        # as a streaming handler
+        def stream_selected_voice(text, voice, speed):
+            yield from stream_audio(text, get_voice_id(voice), speed)
+        stream_btn.click(
+            fn=stream_selected_voice,
+            inputs=[chinese_output, voice_dropdown, speed_slider],
+            outputs=audio_output
+        )
+        # Examples
+        if example_files:
+            gr.Examples(
+                examples=example_files,
+                inputs=audio_input
+            )
+    return interface
+def main():
+    """
+    Log startup, then build the Gradio interface and launch it
+    """
+    logger.info("Starting Gradio application")
+    create_interface().launch()
+if __name__ == "__main__":
+    main()

requirements.txt CHANGED Viewed

@@ -8,8 +8,11 @@ torchaudio>=2.1.0
 scipy>=1.11
 munch>=2.5
 accelerate>=1.2.0
-soundfile>=0.13.0
 kokoro>=0.9.4
 ordered-set>=4.1.0
 phonemizer-fork>=3.3.2
-descript-audio-codec

 scipy>=1.11
 munch>=2.5
 accelerate>=1.2.0
+soundfile>=0.13.1
 kokoro>=0.9.4
 ordered-set>=4.1.0
 phonemizer-fork>=3.3.2
+descript-audio-codec
+gradio>=5.25.2
+gradio-dialogue>=0.0.4
+huggingface-hub>=0.30.2

utils/tts_dia.py CHANGED Viewed

@@ -6,6 +6,7 @@ import numpy as np
 import soundfile as sf
 from pathlib import Path
 from typing import Optional
 from dia.model import Dia
@@ -64,7 +65,7 @@ def _get_model() -> Dia:
             raise
     return _model
 def generate_speech(text: str, language: str = "zh") -> str:
     """Public interface for TTS generation using Dia model

 import soundfile as sf
 from pathlib import Path
 from typing import Optional
+import spaces
 from dia.model import Dia
             raise
     return _model
+@spaces.GPU
 def generate_speech(text: str, language: str = "zh") -> str:
     """Public interface for TTS generation using Dia model