Update app.py
app.py
CHANGED
@@ -17,7 +17,7 @@ from fastapi import FastAPI, HTTPException, UploadFile, File, Form, BackgroundTa
 from fastapi.responses import JSONResponse
 from fastapi.staticfiles import StaticFiles
 from typing import Dict, Any, Optional, Tuple, List
-from …
+from datetime import datetime, timedelta
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -35,7 +35,8 @@ models_loaded = False
 loading_in_progress = False
 loading_thread = None
 model_status = {
-    "…
+    "stt_whisper": "not_loaded",
+    "stt_mms": "not_loaded",
     "mt": "not_loaded",
     "tts": "not_loaded"
 }
@@ -43,10 +44,10 @@ error_message = None
 current_tts_language = "tgl"  # Track the current TTS language
 
 # Model instances
-…
-…
-…
-…
+whisper_processor = None
+whisper_model = None
+mms_processor = None
+mms_model = None
 mt_model = None
 mt_tokenizer = None
 tts_model = None
@@ -62,11 +63,9 @@ LANGUAGE_MAPPING = {
     "Pangasinan": "pag"
 }
 
-# …
-…
-…
-    "tgl": "tagalog"
-}
+# Define which languages use Whisper vs MMS for STT
+WHISPER_LANGUAGES = {"eng", "tgl"}  # English and Tagalog use Whisper
+MMS_LANGUAGES = {"ceb", "ilo", "war", "pag"}  # Other Philippine languages use MMS
 
 NLLB_LANGUAGE_CODES = {
     "eng": "eng_Latn",
@@ -93,39 +92,60 @@ def check_inappropriate_content(text: str) -> bool:
     Check if the text contains inappropriate content.
     Returns True if inappropriate content is detected, False otherwise.
     """
+    # Convert to lowercase for case-insensitive matching
     text_lower = text.lower()
+
+    # Check for inappropriate words
     for word in INAPPROPRIATE_WORDS:
+        # Use word boundary matching to avoid false positives
         pattern = r'\b' + re.escape(word) + r'\b'
         if re.search(pattern, text_lower):
             logger.warning(f"Inappropriate content detected: {word}")
             return True
+
     return False
 
 # Function to save PCM data as a WAV file
 def save_pcm_to_wav(pcm_data: list, sample_rate: int, output_path: str):
+    # Convert pcm_data to a NumPy array of 16-bit integers
     pcm_array = np.array(pcm_data, dtype=np.int16)
+
     with wave.open(output_path, 'wb') as wav_file:
+        # Set WAV parameters: 1 channel (mono), 2 bytes per sample (16-bit), sample rate
         wav_file.setnchannels(1)
-        wav_file.setsampwidth(2)
+        wav_file.setsampwidth(2)  # 16-bit audio
         wav_file.setframerate(sample_rate)
+        # Write the 16-bit PCM data as bytes (little-endian)
         wav_file.writeframes(pcm_array.tobytes())
 
 # Function to detect speech using an energy-based approach
 def detect_speech(waveform: torch.Tensor, sample_rate: int, threshold: float = 0.01, min_speech_duration: float = 0.5) -> bool:
+    """
+    Detects if the audio contains speech using an energy-based approach.
+    Returns True if speech is detected, False otherwise.
+    """
+    # Convert waveform to numpy array
     waveform_np = waveform.numpy()
     if waveform_np.ndim > 1:
-        waveform_np = waveform_np.mean(axis=0)
+        waveform_np = waveform_np.mean(axis=0)  # Convert stereo to mono
+
+    # Compute RMS energy
     rms = np.sqrt(np.mean(waveform_np**2))
     logger.info(f"RMS energy: {rms}")
+
+    # Check if RMS energy exceeds the threshold
     if rms < threshold:
         logger.info("No speech detected: RMS energy below threshold")
         return False
+
+    # Optionally, check for minimum speech duration (requires more sophisticated VAD)
+    # For now, we assume if RMS is above threshold, there is speech
     return True
 
 # Function to clean up old audio files
 def cleanup_old_audio_files():
     logger.info("Starting cleanup of old audio files...")
-    expiration_time = datetime.now() - timedelta(minutes=10)
+    expiration_time = datetime.now() - timedelta(minutes=10)  # Files older than 10 minutes
     for filename in os.listdir(AUDIO_DIR):
         file_path = os.path.join(AUDIO_DIR, filename)
         if os.path.isfile(file_path):
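A note on the new detect_speech: its own comments concede that min_speech_duration goes unused and that honoring it would need more than one whole-clip RMS check. A minimal sketch of that idea (not part of the commit; the frame size and function name are illustrative):

    import numpy as np

    def speech_seconds(waveform_np: np.ndarray, sample_rate: int,
                       threshold: float = 0.01, frame_ms: int = 20) -> float:
        """Seconds of audio whose per-frame RMS exceeds the threshold."""
        frame_len = int(sample_rate * frame_ms / 1000)
        n_frames = len(waveform_np) // frame_len
        if n_frames == 0:
            return 0.0
        frames = waveform_np[:n_frames * frame_len].reshape(n_frames, frame_len)
        rms = np.sqrt((frames ** 2).mean(axis=1))
        return float((rms > threshold).sum()) * frame_ms / 1000.0

detect_speech could then require speech_seconds(...) >= min_speech_duration instead of a single global RMS test.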
@@ -141,49 +161,53 @@ def cleanup_old_audio_files():
 def schedule_cleanup():
     while True:
         cleanup_old_audio_files()
-        time.sleep(300)
+        time.sleep(300)  # Run every 5 minutes (300 seconds)
 
 # Function to load models in background
 def load_models_task():
     global models_loaded, loading_in_progress, model_status, error_message
-    global …
+    global whisper_processor, whisper_model, mms_processor, mms_model
     global mt_model, mt_tokenizer, tts_model, tts_tokenizer
 
     try:
         loading_in_progress = True
+        device = "cuda" if torch.cuda.is_available() else "cpu"
 
-        # Load STT …
-        logger.info("Starting to load STT …
-        from transformers import …
+        # Load Whisper STT model for English and Tagalog
+        logger.info("Starting to load Whisper STT model...")
+        from transformers import WhisperProcessor, WhisperForConditionalGeneration
 
         try:
             logger.info("Loading Whisper STT model...")
-            model_status["…
-            …
-            …
-            stt_model_whisper.to(device)
+            model_status["stt_whisper"] = "loading"
+            whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-small")
+            whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
+            whisper_model.to(device)
             logger.info("Whisper STT model loaded successfully")
-            model_status["…
-        except Exception as …
-            logger.error(f"Failed to load Whisper STT model: {str(…
-            model_status["…
-            error_message = f"Whisper STT model loading failed: {str(…
+            model_status["stt_whisper"] = "loaded"
+        except Exception as whisper_error:
+            logger.error(f"Failed to load Whisper STT model: {str(whisper_error)}")
+            model_status["stt_whisper"] = "failed"
+            error_message = f"Whisper STT model loading failed: {str(whisper_error)}"
             return
-        …
+
+        # Load MMS STT model for other Philippine languages
+        logger.info("Starting to load MMS STT model...")
+        from transformers import AutoProcessor, AutoModelForCTC
+
         try:
             logger.info("Loading MMS STT model...")
-            …
-            …
-            …
+            model_status["stt_mms"] = "loading"
+            mms_processor = AutoProcessor.from_pretrained("facebook/mms-1b-all")
+            mms_model = AutoModelForCTC.from_pretrained("facebook/mms-1b-all")
+            mms_model.to(device)
             logger.info("MMS STT model loaded successfully")
-            model_status["…
-        except Exception as …
-            logger.error(f"Failed to load MMS STT model: {str(…
-            …
-            …
-            return
+            model_status["stt_mms"] = "loaded"
+        except Exception as mms_error:
+            logger.error(f"Failed to load MMS STT model: {str(mms_error)}")
+            model_status["stt_mms"] = "failed"
+            error_message = f"MMS STT model loading failed: {str(mms_error)}"
+            return
 
         # Load MT model
         logger.info("Starting to load MT model...")
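One caveat worth flagging: the commit loads facebook/mms-1b-all once, but that checkpoint ships per-language adapters, so transcribing a specific language normally involves pointing the tokenizer and CTC head at it first. A sketch of how that usually looks with the transformers MMS API (assumed here; the diff does not show adapter switching):

    from transformers import AutoProcessor, AutoModelForCTC

    processor = AutoProcessor.from_pretrained("facebook/mms-1b-all")
    model = AutoModelForCTC.from_pretrained("facebook/mms-1b-all")

    def set_mms_language(lang_code: str) -> None:
        # Swap vocabulary and adapter weights, e.g. "ceb", "ilo", "war", "pag"
        processor.tokenizer.set_target_lang(lang_code)
        model.load_adapter(lang_code)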
@@ -203,7 +227,7 @@ def load_models_task():
             error_message = f"MT model loading failed: {str(e)}"
             return
 
-        # Load TTS model (default to Tagalog)
+        # Load TTS model (default to Tagalog, will be updated dynamically)
         logger.info("Starting to load TTS model...")
        from transformers import VitsModel, AutoTokenizer
 
@@ -217,6 +241,7 @@ def load_models_task():
             model_status["tts"] = "loaded"
         except Exception as e:
             logger.error(f"Failed to load TTS model for Tagalog: {str(e)}")
+            # Fallback to English TTS if the target language fails
             try:
                 logger.info("Falling back to MMS-TTS English model...")
                 tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
@@ -257,13 +282,21 @@ def start_cleanup_task():
 
 # Function to load or update TTS model for a specific language
 def load_tts_model_for_language(target_code: str) -> bool:
+    """
+    Load or update the TTS model for the specified language.
+    Returns True if successful, False otherwise.
+    """
     global tts_model, tts_tokenizer, current_tts_language, model_status
+
     if target_code not in LANGUAGE_MAPPING.values():
         logger.error(f"Invalid language code: {target_code}")
         return False
+
+    # Skip if the model is already loaded for the target language
     if current_tts_language == target_code and model_status["tts"].startswith("loaded"):
         logger.info(f"TTS model for {target_code} is already loaded.")
         return True
+
     device = "cuda" if torch.cuda.is_available() else "cpu"
     try:
         logger.info(f"Loading MMS-TTS model for {target_code}...")
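The rest of load_tts_model_for_language is not shown past this point, but MMS-TTS publishes one checkpoint per language, so the per-language load presumably follows the facebook/mms-tts-<code> naming. A minimal sketch under that assumption:

    from transformers import VitsModel, AutoTokenizer

    def load_vits_for(target_code: str):
        repo = f"facebook/mms-tts-{target_code}"  # e.g. facebook/mms-tts-tgl
        return VitsModel.from_pretrained(repo), AutoTokenizer.from_pretrained(repo)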
@@ -293,21 +326,32 @@ def load_tts_model_for_language(target_code: str) -> bool:
 
 # Function to synthesize speech from text
 def synthesize_speech(text: str, target_code: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Convert text to speech for the specified language.
+    Returns a tuple of (output_path, error_message).
+    """
     global tts_model, tts_tokenizer
+
     request_id = str(uuid.uuid4())
     output_path = os.path.join(AUDIO_DIR, f"{request_id}.wav")
+
+    # Make sure the TTS model is loaded for the target language
     if not load_tts_model_for_language(target_code):
         return None, "Failed to load TTS model for the target language"
+
     device = "cuda" if torch.cuda.is_available() else "cpu"
     try:
-        inputs = tts_tokenizer(text, return_tensors="pt").…
+        inputs = tts_tokenizer(text, return_tensors="pt").to(device)
         with torch.no_grad():
             output = tts_model(**inputs)
         speech = output.waveform.cpu().numpy().squeeze()
         speech = (speech * 32767).astype(np.int16)
         sample_rate = tts_model.config.sampling_rate
+
+        # Save the audio as a WAV file
         save_pcm_to_wav(speech.tolist(), sample_rate, output_path)
         logger.info(f"Saved synthesized audio to {output_path}")
+
         return output_path, None
     except Exception as e:
         error_msg = f"Error during TTS conversion: {str(e)}"
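A small robustness note: (speech * 32767).astype(np.int16) wraps around if the VITS waveform ever strays outside [-1.0, 1.0]. Clipping first avoids that; a sketch (an improvement suggestion, not something this commit does):

    import numpy as np

    def float_to_pcm16(x: np.ndarray) -> np.ndarray:
        # Clamp to the valid float range before scaling to 16-bit PCM
        return (np.clip(x, -1.0, 1.0) * 32767.0).astype(np.int16)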
@@ -323,11 +367,14 @@ async def startup_event():
 
 @app.get("/")
 async def root():
+    """Root endpoint for default health check"""
     logger.info("Root endpoint requested")
     return {"status": "healthy"}
 
 @app.get("/health")
 async def health_check():
+    """Health check endpoint that always returns successfully"""
+    global models_loaded, loading_in_progress, model_status, error_message
     logger.info("Health check requested")
     return {
         "status": "healthy",
@@ -339,16 +386,22 @@ async def health_check():
 
 @app.post("/translate-text")
 async def translate_text(text: str = Form(...), source_lang: str = Form(...), target_lang: str = Form(...)):
+    """Endpoint to translate text and convert to speech"""
     global mt_model, mt_tokenizer, tts_model, tts_tokenizer, current_tts_language
+
     if not text:
         raise HTTPException(status_code=400, detail="No text provided")
     if source_lang not in LANGUAGE_MAPPING or target_lang not in LANGUAGE_MAPPING:
         raise HTTPException(status_code=400, detail="Invalid language selected")
+
     logger.info(f"Translate-text requested: {text} from {source_lang} to {target_lang}")
     request_id = str(uuid.uuid4())
+
+    # Translate the text
     source_code = LANGUAGE_MAPPING[source_lang]
     target_code = LANGUAGE_MAPPING[target_lang]
     translated_text = "Translation not available"
+
     if model_status["mt"] == "loaded" and mt_model is not None and mt_tokenizer is not None:
         try:
             source_nllb_code = NLLB_LANGUAGE_CODES[source_code]
@@ -369,20 +422,26 @@ async def translate_text(text: str = Form(...), source_lang: str = Form(...), ta
             translated_text = f"Translation failed: {str(e)}"
     else:
         logger.warning("MT model not loaded, skipping translation")
+
+    # Check for inappropriate content in the source text and translated text
     is_inappropriate = check_inappropriate_content(text) or check_inappropriate_content(translated_text)
     if is_inappropriate:
         logger.warning("Inappropriate content detected in translation request")
+
+    # Convert translated text to speech
     output_audio_url = None
     if model_status["tts"].startswith("loaded"):
+        # Load or update TTS model for the target language
         if load_tts_model_for_language(target_code):
             try:
                 output_path, error = synthesize_speech(translated_text, target_code)
                 if output_path:
                     output_filename = os.path.basename(output_path)
-                    output_audio_url = f"https://jerich-…
+                    output_audio_url = f"https://jerich-talklasapp.hf.space/audio_output/{output_filename}"
                     logger.info("TTS conversion completed")
             except Exception as e:
                 logger.error(f"Error during TTS conversion: {str(e)}")
+
     return {
         "request_id": request_id,
         "status": "completed",
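For reference, the endpoint can be exercised like this (host taken from the audio URL above; the language names must match LANGUAGE_MAPPING keys, which the diff only shows in part):

    import requests

    resp = requests.post(
        "https://jerich-talklasapp.hf.space/translate-text",
        data={"text": "Good morning", "source_lang": "English", "target_lang": "Tagalog"},
        timeout=60,
    )
    print(resp.json())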
@@ -395,7 +454,8 @@ async def translate_text(text: str = Form(...), source_lang: str = Form(...), ta
 
 @app.post("/translate-audio")
 async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form(...), target_lang: str = Form(...)):
-    …
+    """Endpoint to transcribe, translate, and convert audio to speech"""
+    global whisper_processor, whisper_model, mms_processor, mms_model
     global mt_model, mt_tokenizer, tts_model, tts_tokenizer, current_tts_language
 
     if not audio:
@@ -403,15 +463,19 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
     if source_lang not in LANGUAGE_MAPPING or target_lang not in LANGUAGE_MAPPING:
         raise HTTPException(status_code=400, detail="Invalid language selected")
 
-    …
+    source_code = LANGUAGE_MAPPING[source_lang]
+    target_code = LANGUAGE_MAPPING[target_lang]
+
+    logger.info(f"Translate-audio requested: {audio.filename} from {source_lang} ({source_code}) to {target_lang} ({target_code})")
     request_id = str(uuid.uuid4())
 
-    …
-    use_whisper = source_code in …
-    …
-    …
-    …
-    …
+    # Determine which STT model to use based on source language
+    use_whisper = source_code in WHISPER_LANGUAGES
+    use_mms = source_code in MMS_LANGUAGES
+
+    # Check if the appropriate STT model is loaded
+    if use_whisper and (model_status["stt_whisper"] != "loaded" or whisper_processor is None or whisper_model is None):
+        logger.warning("Whisper STT model not loaded for English/Tagalog, returning placeholder response")
         return {
             "request_id": request_id,
             "status": "processing",
@@ -421,8 +485,9 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
             "output_audio": None,
             "is_inappropriate": False
         }
-    …
-    …
+
+    if use_mms and (model_status["stt_mms"] != "loaded" or mms_processor is None or mms_model is None):
+        logger.warning("MMS STT model not loaded for Philippine languages, returning placeholder response")
         return {
             "request_id": request_id,
             "status": "processing",
@@ -433,6 +498,7 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
             "is_inappropriate": False
         }
 
+    # Save the uploaded audio to a temporary file
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
         temp_file.write(await audio.read())
         temp_path = temp_file.name
@@ -443,16 +509,19 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
     is_inappropriate = False
 
     try:
+        # Step 1: Load and resample the audio using torchaudio
         logger.info(f"Reading audio file: {temp_path}")
         waveform, sample_rate = torchaudio.load(temp_path)
         logger.info(f"Audio loaded: sample_rate={sample_rate}, waveform_shape={waveform.shape}")
 
+        # Resample to 16 kHz if needed (required by Whisper and MMS models)
         if sample_rate != 16000:
             logger.info(f"Resampling audio from {sample_rate} Hz to 16000 Hz")
             resampler = torchaudio.transforms.Resample(sample_rate, 16000)
             waveform = resampler(waveform)
             sample_rate = 16000
 
+        # Step 2: Detect speech
         if not detect_speech(waveform, sample_rate):
             return {
                 "request_id": request_id,
@@ -464,26 +533,45 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
                 "is_inappropriate": False
             }
 
+        # Step 3: Transcribe the audio (STT)
         device = "cuda" if torch.cuda.is_available() else "cpu"
-        logger.info(f"Using device: {device}")
+        logger.info(f"Using device: {device} for STT")
 
         if use_whisper:
-            …
-            …
-            …
+            # Use Whisper model for English and Tagalog
+            logger.info(f"Using Whisper model for language: {source_code}")
+
+            # Prepare audio for Whisper
+            inputs = whisper_processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt").to(device)
+            logger.info("Audio processed for Whisper, generating transcription...")
+
             with torch.no_grad():
-                …
-                …
+                # For English, we can specify the language; for Tagalog we use 'tl'
+                forced_language = "en" if source_code == "eng" else "tl"
+                generated_ids = whisper_model.generate(
+                    **inputs,
+                    language=forced_language,
+                    task="transcribe"
+                )
+            transcription = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
         else:
-            …
-            …
+            # Use MMS model for other Philippine languages
+            logger.info(f"Using MMS model for language: {source_code}")
+
+            # Prepare audio for MMS
+            inputs = mms_processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt").to(device)
+            logger.info("Audio processed for MMS, generating transcription...")
+
             with torch.no_grad():
-                …
+                # Process with MMS
+                logits = mms_model(**inputs).logits
             predicted_ids = torch.argmax(logits, dim=-1)
-            transcription = …
+            transcription = mms_processor.batch_decode(predicted_ids)[0]
+
         logger.info(f"Transcription completed: {transcription}")
 
-        …
+        # Step 4: Translate the transcribed text (MT)
         if model_status["mt"] == "loaded" and mt_model is not None and mt_tokenizer is not None:
             try:
                 source_nllb_code = NLLB_LANGUAGE_CODES[source_code]
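The language=/task= keyword arguments to whisper_model.generate require a reasonably recent transformers release. On older versions the same forcing is typically done through forced decoder ids (using the same whisper_processor, whisper_model, and inputs as in the hunk above):

    # Alternative for older transformers versions (assumption, not in the commit)
    forced_ids = whisper_processor.get_decoder_prompt_ids(language="tl", task="transcribe")
    generated_ids = whisper_model.generate(**inputs, forced_decoder_ids=forced_ids)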
@@ -504,16 +592,18 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
         else:
             logger.warning("MT model not loaded, skipping translation")
 
+        # Step 5: Check for inappropriate content
         is_inappropriate = check_inappropriate_content(transcription) or check_inappropriate_content(translated_text)
         if is_inappropriate:
             logger.warning("Inappropriate content detected in audio transcription or translation")
 
+        # Step 6: Convert translated text to speech (TTS)
         if load_tts_model_for_language(target_code):
             try:
                 output_path, error = synthesize_speech(translated_text, target_code)
                 if output_path:
                     output_filename = os.path.basename(output_path)
-                    output_audio_url = f"https://jerich-…
+                    output_audio_url = f"https://jerich-talklasapp.hf.space/audio_output/{output_filename}"
                     logger.info("TTS conversion completed")
             except Exception as e:
                 logger.error(f"Error during TTS conversion: {str(e)}")
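A matching client-side call for /translate-audio, uploading a WAV file (hypothetical file name; host as above):

    import requests

    with open("sample.wav", "rb") as f:
        resp = requests.post(
            "https://jerich-talklasapp.hf.space/translate-audio",
            files={"audio": ("sample.wav", f, "audio/wav")},
            data={"source_lang": "Tagalog", "target_lang": "English"},
            timeout=120,
        )
    print(resp.json())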
@@ -544,6 +634,7 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
 
 @app.post("/text-to-speech")
 async def text_to_speech(text: str = Form(...), target_lang: str = Form(...)):
+    """Endpoint to convert text to speech in the specified language"""
     if not text:
         raise HTTPException(status_code=400, detail="No text provided")
     if target_lang not in LANGUAGE_MAPPING:
@@ -553,17 +644,20 @@ async def text_to_speech(text: str = Form(...), target_lang: str = Form(...)):
     request_id = str(uuid.uuid4())
 
     target_code = LANGUAGE_MAPPING[target_lang]
+
+    # Check for inappropriate content
     is_inappropriate = check_inappropriate_content(text)
     if is_inappropriate:
         logger.warning("Inappropriate content detected in text-to-speech request")
 
+    # Synthesize speech
     output_audio_url = None
     if model_status["tts"].startswith("loaded") or load_tts_model_for_language(target_code):
         try:
             output_path, error = synthesize_speech(text, target_code)
             if output_path:
                 output_filename = os.path.basename(output_path)
-                output_audio_url = f"https://jerich-…
+                output_audio_url = f"https://jerich-talklasapp.hf.space/audio_output/{output_filename}"
                 logger.info("TTS conversion completed")
             else:
                 logger.error(f"TTS conversion failed: {error}")
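Finally, /text-to-speech takes only text and target_lang; a matching client call (host as above, language name must be a LANGUAGE_MAPPING key):

    import requests

    resp = requests.post(
        "https://jerich-talklasapp.hf.space/text-to-speech",
        data={"text": "Magandang umaga", "target_lang": "Tagalog"},
        timeout=60,
    )
    print(resp.json())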