Michael Hu committed
Commit c72d839 · 1 Parent(s): f7102b4

add more logging

Files changed (4):
  1. app.py +39 -10
  2. utils/stt.py +51 -33
  3. utils/translation.py +42 -29
  4. utils/tts.py +42 -24
app.py CHANGED
@@ -3,6 +3,18 @@ Main entry point for the Audio Translation Web Application
 Handles file upload, processing pipeline, and UI rendering
 """
 
+# Configure logging first
+import logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler("app.log"),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+
 import streamlit as st
 import os
 import time
@@ -14,12 +26,14 @@ from utils.tts_dummy import generate_speech
 # Hugging Face Spaces Setup Automation
 def setup_huggingface_space():
     """Automatically configure Hugging Face Space requirements"""
+    logger.debug("Running Hugging Face space setup")
     st.sidebar.header("Space Configuration")
 
-    # Check for required system packages
     try:
         subprocess.run(["espeak-ng", "--version"], check=True, capture_output=True)
+        logger.debug("espeak-ng verification successful")
     except (FileNotFoundError, subprocess.CalledProcessError):
+        logger.error("Missing espeak-ng dependency")
         st.sidebar.error("""
         **Missing System Dependencies!** Add this to your Space settings:
         ```txt
@@ -28,7 +42,6 @@ def setup_huggingface_space():
         """)
         st.stop()
 
-    # Verify model files
     model_dir = "./kokoro"
     required_files = [
         f"{model_dir}/kokoro-v0_19.pth",
@@ -36,6 +49,7 @@ def setup_huggingface_space():
     ]
 
     if not all(os.path.exists(f) for f in required_files):
+        logger.error("Missing model files in %s", model_dir)
         st.sidebar.warning("""
         **Missing Model Files!** Add this to your Space settings:
         ```txt
@@ -50,6 +64,7 @@ os.makedirs("temp/outputs", exist_ok=True)
 
 def configure_page():
     """Set up Streamlit page configuration"""
+    logger.debug("Configuring Streamlit page")
     st.set_page_config(
         page_title="Audio Translator",
         page_icon="🎧",
@@ -72,36 +87,51 @@ def handle_file_processing(upload_path):
     2. Machine Translation
     3. Text-to-Speech (TTS)
     """
+    logger.info(f"Starting processing for: {upload_path}")
     progress_bar = st.progress(0)
     status_text = st.empty()
 
     try:
         # STT Phase
+        logger.debug("Beginning STT processing")
         status_text.markdown("🔍 **Performing Speech Recognition...**")
-        english_text = transcribe_audio(upload_path)
+        with st.spinner("Initializing Whisper model..."):
+            english_text = transcribe_audio(upload_path)
         progress_bar.progress(30)
+        logger.info(f"STT completed. Text length: {len(english_text)} characters")
 
         # Translation Phase
+        logger.debug("Beginning translation")
         status_text.markdown("🌐 **Translating Content...**")
-        chinese_text = translate_text(english_text)
+        with st.spinner("Loading translation model..."):
+            chinese_text = translate_text(english_text)
        progress_bar.progress(60)
+        logger.info(f"Translation completed. Translated length: {len(chinese_text)} characters")
 
         # TTS Phase
+        logger.debug("Beginning TTS generation")
         status_text.markdown("🎵 **Generating Chinese Speech...**")
-        output_path = generate_speech(chinese_text, language="zh")
+        with st.spinner("Initializing TTS engine..."):
+            output_path = generate_speech(chinese_text, language="zh")
         progress_bar.progress(100)
+        logger.info(f"TTS completed. Output file: {output_path}")
 
         # Display results
         status_text.success("✅ Processing Complete!")
         return english_text, chinese_text, output_path
 
     except Exception as e:
+        logger.error(f"Processing failed: {str(e)}", exc_info=True)
         status_text.error(f"❌ Processing Failed: {str(e)}")
         st.exception(e)
         raise
 
 def render_results(english_text, chinese_text, output_path):
     """Display processing results in organized columns"""
+    logger.debug("Rendering results")
     st.divider()
 
     col1, col2 = st.columns([2, 1])
@@ -125,12 +155,12 @@ def render_results(english_text, chinese_text, output_path):
 
 def main():
     """Main application workflow"""
+    logger.info("Starting application")
     # setup_huggingface_space()  # First-run configuration checks
     configure_page()
     st.title("🎧 High-Quality Audio Translation System")
     st.markdown("Upload English Audio → Get Chinese Speech Output")
 
-    # File uploader widget
     uploaded_file = st.file_uploader(
         "Select Audio File (MP3/WAV)",
         type=["mp3", "wav"],
@@ -138,12 +168,11 @@ def main():
     )
 
     if uploaded_file:
-        # Save uploaded file
+        logger.info(f"File uploaded: {uploaded_file.name}")
         upload_path = os.path.join("temp/uploads", uploaded_file.name)
         with open(upload_path, "wb") as f:
             f.write(uploaded_file.getbuffer())
 
-        # Execute processing pipeline
         results = handle_file_processing(upload_path)
         if results:
             render_results(*results)
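
A note on the logging setup above: `logging.basicConfig` in app.py installs a `FileHandler` and a `StreamHandler` on the root logger, and the `logging.getLogger(__name__)` loggers created in the utils modules propagate to those root handlers, so no per-module handler wiring is needed. One consequence of `level=logging.INFO`: every `logger.debug(...)` call added in this commit is filtered out until that level is lowered. A minimal sketch of both behaviors (the child logger name is just illustrative):

```python
import logging

# Same configuration as app.py: root logger writes to app.log and stderr.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler("app.log"), logging.StreamHandler()],
)

# What each utils module does via logging.getLogger(__name__):
child = logging.getLogger("utils.stt")
child.info("reaches both root handlers")          # emitted
child.debug("dropped: DEBUG < INFO at the root")  # filtered out
```

Since Streamlit re-executes the script on every interaction, it also helps that `basicConfig` is a no-op once the root logger already has handlers, so handlers are not duplicated across reruns.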
utils/stt.py CHANGED
@@ -3,6 +3,9 @@ Speech Recognition Module using Whisper Large-v3
 Handles audio preprocessing and transcription
 """
 
+import logging
+logger = logging.getLogger(__name__)
+
 import torch
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
 from pydub import AudioSegment
@@ -15,37 +18,52 @@ def transcribe_audio(audio_path):
     Returns:
         Transcribed English text
     """
-    # Configure hardware settings
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-
-    # Convert to proper audio format
-    audio = AudioSegment.from_file(audio_path)
-    processed_audio = audio.set_frame_rate(16000).set_channels(1)
-    wav_path = audio_path.replace(".mp3", ".wav")
-    processed_audio.export(wav_path, format="wav")
-
-    # Initialize ASR model
-    model = AutoModelForSpeechSeq2Seq.from_pretrained(
-        "openai/whisper-large-v3",
-        torch_dtype=torch.float32,
-        low_cpu_mem_usage=True,
-        use_safetensors=True
-    ).to(device)
-
-    processor = AutoProcessor.from_pretrained("openai/whisper-large-v3")
-
-    # Process audio input
-    inputs = processor(
-        wav_path,
-        sampling_rate=16000,
-        return_tensors="pt",
-        truncation=True,
-        chunk_length_s=30,
-        stride_length_s=5
-    ).to(device)
-
-    # Generate transcription
-    with torch.no_grad():
-        outputs = model.generate(**inputs, language="en", task="transcribe")
-
-    return processor.batch_decode(outputs, skip_special_tokens=True)[0]
+    logger.info(f"Starting transcription for: {audio_path}")
+
+    try:
+        # Audio conversion
+        logger.debug("Converting audio format")
+        audio = AudioSegment.from_file(audio_path)
+        processed_audio = audio.set_frame_rate(16000).set_channels(1)
+        wav_path = audio_path.replace(".mp3", ".wav")
+        processed_audio.export(wav_path, format="wav")
+        logger.debug(f"Audio converted to: {wav_path}")
+
+        # Model initialization
+        logger.info("Loading Whisper model")
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.debug(f"Using device: {device}")
+
+        model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            "openai/whisper-large-v3",
+            torch_dtype=torch.float32,
+            low_cpu_mem_usage=True,
+            use_safetensors=True
+        ).to(device)
+
+        processor = AutoProcessor.from_pretrained("openai/whisper-large-v3")
+        logger.debug("Model loaded successfully")
+
+        # Processing
+        logger.debug("Processing audio input")
+        inputs = processor(
+            wav_path,
+            sampling_rate=16000,
+            return_tensors="pt",
+            truncation=True,
+            chunk_length_s=30,
+            stride_length_s=5
+        ).to(device)
+
+        # Transcription
+        logger.info("Generating transcription")
+        with torch.no_grad():
+            outputs = model.generate(**inputs, language="en", task="transcribe")
+
+        result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+        logger.info("Transcription completed successfully")
+        return result
+
+    except Exception as e:
+        logger.error(f"Transcription failed: {str(e)}", exc_info=True)
+        raise
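
One caveat in `transcribe_audio` that the new error logging will surface quickly: the Whisper processor expects a raw waveform (a float array plus `sampling_rate`), not a path string, so passing `wav_path` directly is likely to fail inside the feature extractor; `chunk_length_s`/`stride_length_s` are, as far as I can tell, arguments of the transformers ASR pipeline rather than of the processor call. A hedged sketch of the array-based call, reusing `model`, `processor`, and `device` as defined above and assuming the 16 kHz mono WAV produced by the pydub step:

```python
import numpy as np
import torch
from pydub import AudioSegment

# Load the converted 16 kHz mono WAV and scale 16-bit PCM to [-1, 1].
audio = AudioSegment.from_wav(wav_path)
samples = np.array(audio.get_array_of_samples()).astype(np.float32) / 32768.0

# Feed the waveform, not the path, to the Whisper processor.
inputs = processor(samples, sampling_rate=16000, return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model.generate(**inputs, language="en", task="transcribe")

english_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
```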
utils/translation.py CHANGED
@@ -3,6 +3,9 @@ Text Translation Module using NLLB-3.3B model
 Handles text segmentation and batch translation
 """
 
+import logging
+logger = logging.getLogger(__name__)
+
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
 def translate_text(text):
@@ -13,33 +16,43 @@ def translate_text(text):
     Returns:
         Translated Chinese text
     """
-    # Initialize translation model
-    tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-3.3B")
-    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-3.3B")
-
-    # Split long text into manageable chunks
-    max_chunk_length = 1000
-    text_chunks = [
-        text[i:i+max_chunk_length]
-        for i in range(0, len(text), max_chunk_length)
-    ]
-
-    translated_chunks = []
-    for chunk in text_chunks:
-        # Prepare model inputs
-        inputs = tokenizer(
-            chunk,
-            return_tensors="pt",
-            max_length=1024,
-            truncation=True
-        )
-
-        # Generate translation
-        outputs = model.generate(
-            **inputs,
-            forced_bos_token_id=tokenizer.lang_code_to_id["zho_Hans"],
-            max_new_tokens=1024
-        )
-        translated_chunks.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
-
-    return "".join(translated_chunks)
+    logger.info(f"Starting translation for text length: {len(text)}")
+
+    try:
+        # Model initialization
+        logger.info("Loading NLLB model")
+        tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-3.3B")
+        model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-3.3B")
+        logger.debug("Translation model loaded")
+
+        # Text processing
+        max_chunk_length = 1000
+        text_chunks = [text[i:i+max_chunk_length] for i in range(0, len(text), max_chunk_length)]
+        logger.info(f"Split text into {len(text_chunks)} chunks")
+
+        translated_chunks = []
+        for i, chunk in enumerate(text_chunks):
+            logger.debug(f"Processing chunk {i+1}/{len(text_chunks)}")
+            inputs = tokenizer(
+                chunk,
+                return_tensors="pt",
+                max_length=1024,
+                truncation=True
+            )
+
+            outputs = model.generate(
+                **inputs,
+                forced_bos_token_id=tokenizer.lang_code_to_id["zho_Hans"],
+                max_new_tokens=1024
+            )
+            translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
+            translated_chunks.append(translated)
+            logger.debug(f"Chunk {i+1} translated successfully")
+
+        result = "".join(translated_chunks)
+        logger.info(f"Translation completed. Total length: {len(result)}")
+        return result
+
+    except Exception as e:
+        logger.error(f"Translation failed: {str(e)}", exc_info=True)
+        raise
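
A portability note on `tokenizer.lang_code_to_id["zho_Hans"]`: that mapping exists on older transformers releases but was removed from the NLLB tokenizer in newer ones, where `convert_tokens_to_ids("zho_Hans")` is the equivalent; the source language is also worth pinning via `src_lang` so inputs carry the English tag. A version-tolerant sketch, assuming the same checkpoint as above:

```python
from transformers import AutoTokenizer

# Pin the source language so encoder inputs are tagged eng_Latn.
tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-3.3B", src_lang="eng_Latn"
)

# Resolve the target-language BOS id across transformers versions.
try:
    zho_hans_id = tokenizer.lang_code_to_id["zho_Hans"]        # older releases
except AttributeError:
    zho_hans_id = tokenizer.convert_tokens_to_ids("zho_Hans")  # newer releases

# Then, as in translate_text above:
# outputs = model.generate(**inputs, forced_bos_token_id=zho_hans_id, max_new_tokens=1024)
```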
utils/tts.py CHANGED
@@ -1,10 +1,13 @@
 import os
 import torch
 import time
+import logging
 from pydub import AudioSegment
 from phonemizer.backend.espeak.wrapper import EspeakWrapper
 from models import build_model
 
+logger = logging.getLogger(__name__)
+
 # Hugging Face Spaces setup
 MODEL_DIR = "./kokoro"
 os.makedirs(MODEL_DIR, exist_ok=True)
@@ -14,12 +17,17 @@ EspeakWrapper.set_library('/usr/lib/x86_64-linux-gnu/libespeak-ng.so.1')
 
 class TTSEngine:
     def __init__(self):
+        logger.info("Initializing TTS Engine")
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.debug(f"Using device: {self.device}")
         self._verify_model_files()
+        logger.info("Loading Kokoro model")
         self.model = build_model(f"{MODEL_DIR}/kokoro-v0_19.pth", self.device)
+        logger.info("Loading voice model")
         self.voice = torch.load(f"{MODEL_DIR}/voices/af_bella.pt",
                                 map_location=self.device)
-
+        logger.info("TTS engine initialized")
+
     def _verify_model_files(self):
         """Ensure required model files exist"""
         required_files = [
@@ -29,6 +37,7 @@ class TTSEngine:
 
         missing = [f for f in required_files if not os.path.exists(f)]
         if missing:
+            logger.error(f"Missing model files: {missing}")
             raise FileNotFoundError(
                 f"Missing model files: {missing}\n"
                 "Add this to your Hugging Face Space settings:\n"
@@ -38,30 +47,39 @@ class TTSEngine:
 
     def generate_speech(self, text: str, language: str = "zh") -> str:
         """Generate speech from Chinese text"""
-        from kokoro import generate_full
-
-        # Safety checks for Hugging Face Free Tier
-        if len(text) > 500:
-            text = text[:495] + "[TRUNCATED]"
-
-        audio, _ = generate_full(
-            self.model,
-            text,
-            self.voice,
-            lang='en-us',
-            max_len=200 if self.device == "cpu" else 500
-        )
-
-        # Save output
-        output_path = f"temp/outputs/output_{int(time.time())}.wav"
-        AudioSegment(
-            audio.numpy().tobytes(),
-            frame_rate=24000,
-            sample_width=2,
-            channels=1
-        ).export(output_path, format="wav")
-
-        return output_path
+        logger.info(f"Generating speech for text length: {len(text)}")
+
+        try:
+            from kokoro import generate_full
+
+            if len(text) > 500:
+                logger.warning(f"Truncating long text ({len(text)} characters)")
+                text = text[:495] + "[TRUNCATED]"
+
+            logger.debug("Starting audio generation")
+            audio, _ = generate_full(
+                self.model,
+                text,
+                self.voice,
+                lang='en-us',
+                max_len=200 if self.device == "cpu" else 500
+            )
+
+            output_path = f"temp/outputs/output_{int(time.time())}.wav"
+            logger.debug(f"Saving audio to {output_path}")
+            AudioSegment(
+                audio.numpy().tobytes(),
+                frame_rate=24000,
+                sample_width=2,
+                channels=1
+            ).export(output_path, format="wav")
+
+            logger.info(f"Audio generation complete: {output_path}")
+            return output_path
+
+        except Exception as e:
+            logger.error(f"TTS generation failed: {str(e)}", exc_info=True)
+            raise
 
 # Initialize TTS engine once
 @st.cache_resource
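
The trailing context shows a `@st.cache_resource` decorator, but nothing in the visible diff imports streamlit into utils/tts.py; if that import is in fact missing, the module raises `NameError` at import time, before any of the new logging runs. A sketch of the cached-singleton pattern the decorator implies (`get_engine` and the module-level wrapper are hypothetical, since the diff is cut off here):

```python
import streamlit as st

@st.cache_resource  # construct the engine once per process, reuse across reruns
def get_engine() -> "TTSEngine":
    return TTSEngine()

def generate_speech(text: str, language: str = "zh") -> str:
    # Thin wrapper so app.py can keep importing a plain function.
    return get_engine().generate_speech(text, language)
```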