Update app.py

app.py CHANGED
@@ -31,16 +31,20 @@ os.makedirs(AUDIO_DIR, exist_ok=True)
31 |   app.mount("/audio_output", StaticFiles(directory=AUDIO_DIR), name="audio_output")
32 |
33 |   # Global variables to track application state
34-41 | - …
42 |   }
43 | - error_message = None
44 |
45 |   # Define the valid languages and mappings
46 |   LANGUAGE_MAPPING = {
@@ -61,30 +65,25 @@ NLLB_LANGUAGE_CODES = {
61 |   "pag": "pag_Latn"
62 |   }
63 |
64-70 | - …
71 |
72 | - mt_model = None
73 | - mt_tokenizer = None
74 |
75-76 | - …
77 |
78 | - # List of inappropriate words/phrases for content filtering
79 | - INAPPROPRIATE_WORDS = [
80 | -     "fuck", "shit", "asshole", "bitch", "dick", "pussy", "cunt",
81 | -     "whore", "slut", "bastard", "damn", "hell", "piss", "nigger",
82 | -     "faggot", "retard", "crap", "porn", "sex", "penis", "vagina",
83 | -     # Tagalog inappropriate words
84 | -     "puta", "putangina", "gago", "bobo", "tanga", "tarantado",
85 | -     "inutil", "ulol", "kantot", "jakol", "tite", "pekpek",
86 | -     # Add more as needed
87 | - ]
88 |
89 |   # Function to save PCM data as a WAV file
90 |   def save_pcm_to_wav(pcm_data: list, sample_rate: int, output_path: str):
@@ -99,6 +98,7 @@ def save_pcm_to_wav(pcm_data: list, sample_rate: int, output_path: str):
99 |       # Write the 16-bit PCM data as bytes (little-endian)
100 |      wav_file.writeframes(pcm_array.tobytes())
101 |
102 |  # Function to detect speech using an energy-based approach
103 |  def detect_speech(waveform: torch.Tensor, sample_rate: int, threshold: float = 0.01, min_speech_duration: float = 0.5) -> bool:
104 |      """
@@ -123,52 +123,6 @@ def detect_speech(waveform: torch.Tensor, sample_rate: int, threshold: float = 0.01, min_speech_duration: float = 0.5) -> bool:
123 |      # For now, we assume if RMS is above threshold, there is speech
124 |      return True
125 |
126 | - # Function to check for inappropriate content
127 | - def check_inappropriate_content(text: str) -> bool:
128 | -     """
129 | -     Checks if the text contains inappropriate content.
130 | -     Returns True if inappropriate content is detected, False otherwise.
131 | -     """
132 | -     # Convert text to lowercase for case-insensitive matching
133 | -     text_lower = text.lower()
134 | -
135 | -     # Check if any inappropriate word is in the text
136 | -     for word in INAPPROPRIATE_WORDS:
137 | -         # Use word boundary regex to match whole words only
138 | -         pattern = r'\b' + re.escape(word) + r'\b'
139 | -         if re.search(pattern, text_lower):
140 | -             logger.warning(f"Inappropriate content detected: '{word}'")
141 | -             return True
142 | -
143 | -     return False
144 | -
145 | - # Function to perform text-to-speech conversion
146 | - def text_to_speech(text: str, language_code: str) -> Tuple[Optional[np.ndarray], Optional[int], Optional[str]]:
147 | -     """
148 | -     Convert text to speech using the appropriate TTS model.
149 | -     Returns the speech waveform, sample rate, and any error message.
150 | -     """
151 | -     if language_code not in tts_models or tts_models[language_code] is None:
152 | -         error_msg = f"TTS model for {language_code} not loaded"
153 | -         logger.error(error_msg)
154 | -         return None, None, error_msg
155 | -
156 | -     try:
157 | -         device = "cuda" if torch.cuda.is_available() else "cpu"
158 | -         inputs = tts_tokenizers[language_code](text, return_tensors="pt").to(device)
159 | -
160 | -         with torch.no_grad():
161 | -             output = tts_models[language_code](**inputs)
162 | -
163 | -         speech = output.waveform.cpu().numpy().squeeze()
164 | -         speech = (speech * 32767).astype(np.int16)
165 | -         sample_rate = tts_models[language_code].config.sampling_rate
166 | -
167 | -         return speech, sample_rate, None
168 | -     except Exception as e:
169 | -         error_msg = f"Error during TTS conversion: {str(e)}"
170 | -         logger.error(error_msg)
171 | -         return None, None, error_msg
172 |
173 |  # Function to clean up old audio files
174 |  def cleanup_old_audio_files():
@@ -185,142 +139,157 @@ def cleanup_old_audio_files():
185 |      except Exception as e:
186 |          logger.error(f"Error deleting file {file_path}: {str(e)}")
187 |
188 |  # Background task to periodically clean up audio files
189 |  def schedule_cleanup():
190 |      while True:
191 |          cleanup_old_audio_files()
192 |          time.sleep(300)  # Run every 5 minutes (300 seconds)
193 |
194-197 | - …
198 |
199 |      try:
200 | - …
201 |          device = "cuda" if torch.cuda.is_available() else "cpu"
202 |
203-220 | - …
221 |
222-223 | - …
224 | -         logger.info("Loading Whisper Small STT model...")
225 | -         model_status["stt_whisper_small"] = "loading"
226 | -         from transformers import WhisperProcessor, WhisperForConditionalGeneration
227 | -
228 | -         stt_models["whisper_small_processor"] = WhisperProcessor.from_pretrained("openai/whisper-small")
229 | -         stt_models["whisper_small"] = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
230 | -         stt_models["whisper_small"].to(device)
231 | -         logger.info("Whisper Small STT model loaded successfully")
232 | -         model_status["stt_whisper_small"] = "loaded"
233 | -     except Exception as whisper_error:
234 | -         logger.error(f"Failed to load Whisper Small STT model: {str(whisper_error)}")
235 | -         model_status["stt_whisper_small"] = "failed"
236 | -         error_message = f"Whisper Small STT model loading failed: {str(whisper_error)}"
237 | -
238 | -     # Load MT model
239 | -     logger.info("Starting to load MT model...")
240 | -     from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
241 |
242-244 | - …
245 | -         mt_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
246 | -         mt_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
247 | -         mt_model.to(device)
248 | -         logger.info("MT model loaded successfully")
249 | -         model_status["mt"] = "loaded"
250 | -     except Exception as e:
251 | -         logger.error(f"Failed to load MT model: {str(e)}")
252 | -         model_status["mt"] = "failed"
253 | -         error_message = f"MT model loading failed: {str(e)}"
254 |
255 | - …
256 | -     logger.info("…
257 | - …
258 |
259-260 | - …
261 | -             logger.info(f"Loading MMS-TTS model for {lang_name} ({lang_code})...")
262 | -             model_status["tts"][lang_code] = "loading"
263 | -
264 | -             # Load the model and tokenizer
265 | -             tts_models[lang_code] = VitsModel.from_pretrained(f"facebook/mms-tts-{lang_code}")
266 | -             tts_tokenizers[lang_code] = AutoTokenizer.from_pretrained(f"facebook/mms-tts-{lang_code}")
267 | -
268 | -             # Move to GPU if available
269 | -             tts_models[lang_code].to(device)
270 | -
271 | -             logger.info(f"TTS model for {lang_name} loaded successfully")
272 | -             model_status["tts"][lang_code] = "loaded"
273 | -         except Exception as e:
274 | -             logger.error(f"Failed to load TTS model for {lang_name}: {str(e)}")
275 | -             model_status["tts"][lang_code] = "failed"
276 | -
277 | -             # Try to load English as fallback if this is not English
278 | -             if lang_code != "eng":
279 | -                 try:
280 | -                     logger.info(f"Trying to load English TTS model as fallback for {lang_name}...")
281 | -                     # Only load English model once if not already loaded
282 | -                     if "eng" not in tts_models or tts_models["eng"] is None:
283 | -                         tts_models["eng"] = VitsModel.from_pretrained("facebook/mms-tts-eng")
284 | -                         tts_tokenizers["eng"] = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
285 | -                         tts_models["eng"].to(device)
286 | -                         model_status["tts"]["eng"] = "loaded"
287 | -
288 | -                     # Point this language to use English model
289 | -                     tts_models[lang_code] = tts_models["eng"]
290 | -                     tts_tokenizers[lang_code] = tts_tokenizers["eng"]
291 | -                     model_status["tts"][lang_code] = "loaded (fallback to eng)"
292 | -                 except Exception as e2:
293 | -                     logger.error(f"Failed to load English fallback TTS model: {str(e2)}")
294 | -                     model_status["tts"][lang_code] = "failed (with fallback)"
295 |
296-298 | - …
299 | -     mt_loaded = model_status["mt"] == "loaded"
300 | -     any_tts_loaded = any(status == "loaded" or status.startswith("loaded (fallback")
301 | -                         for status in model_status["tts"].values())
302 |
303 | - …
304 |
305-309 | - …
310 |      except Exception as e:
311 | - …
312 | -         logger.error(f"…
313 |      finally:
314 | - …
315 |
316 | - # Start loading models in background
317 | - def start_model_loading():
318 | -     global loading_thread, loading_in_progress
319 | -     if not loading_in_progress:
320 | -         loading_in_progress = True
321 | -         loading_thread = threading.Thread(target=load_models_task)
322 | -         loading_thread.daemon = True
323 | -         loading_thread.start()
324 |
325 |  # Start the background cleanup task
326 |  def start_cleanup_task():
@@ -328,88 +297,130 @@ def start_cleanup_task():
328 |      cleanup_thread.daemon = True
329 |      cleanup_thread.start()
330 |
331 |  # Start the background processes when the app starts
332 |  @app.on_event("startup")
333 |  async def startup_event():
334 |      logger.info("Application starting up...")
335 | -     start_model_loading()
336 |      start_cleanup_task()
337 |
338 |  @app.get("/")
339 |  async def root():
340 |      """Root endpoint for default health check"""
341 |      logger.info("Root endpoint requested")
342 |      return {"status": "healthy"}
343 |
344 |  @app.get("/health")
345 |  async def health_check():
346 |      """Health check endpoint that always returns successfully"""
347 | -     global models_loaded, loading_in_progress, model_status, error_message
348 |      logger.info("Health check requested")
349 |      return {
350 |          "status": "healthy",
351 | -         "…
352-354 | - …
355 |      }
356 |
357 |  @app.post("/synthesize-speech")
358 |  async def synthesize_speech(text: str = Form(...), language: str = Form(...)):
359 |      """Endpoint to synthesize speech from text without translation"""
360 |      if language not in LANGUAGE_MAPPING:
361 |          raise HTTPException(status_code=400, detail="Invalid language selected")
362 |
363 | -     logger.info(f"Speech synthesis requested for text in {language}")
364 | -     request_id = str(uuid.uuid4())
365 |      language_code = LANGUAGE_MAPPING[language]
366 |
367 | -     # …
368 | -     if …
369 |          return {
370 |              "request_id": request_id,
371 | -             "status": "…
372 | -             "message": …
373 | -             "output_audio": None,
374 | -             "is_inappropriate": False
375 |          }
376 |
377-381 | - …
382 |
383 | - …
384 |          return {
385 |              "request_id": request_id,
386 |              "status": "failed",
387 | -             "message": …
388 | -             "…
389 | -             "…
390 |          }
391 | -
392 | -     # Save the synthesized audio
393 | -     output_filename = f"{request_id}.wav"
394 | -     output_path = os.path.join(AUDIO_DIR, output_filename)
395 | -     save_pcm_to_wav(speech.tolist(), sample_rate, output_path)
396 | -
397 | -     # Generate URL to the WAV file
398 | -     output_audio_url = f"https://jerich-talklasapp2.hf.space/audio_output/{output_filename}"
399 | -
400 | -     return {
401 | -         "request_id": request_id,
402 | -         "status": "completed",
403 | -         "message": "Speech synthesis completed",
404 | -         "output_audio": output_audio_url,
405 | -         "is_inappropriate": is_inappropriate
406 | -     }
407 |
408 |  @app.post("/translate-text")
409 |  async def translate_text(text: str = Form(...), source_lang: str = Form(...), target_lang: str = Form(...)):
410 |      """Endpoint to translate text and convert to speech"""
411 | -     global mt_model, mt_tokenizer
412 | -
413 |      if not text:
414 |          raise HTTPException(status_code=400, detail="No text provided")
415 |      if source_lang not in LANGUAGE_MAPPING or target_lang not in LANGUAGE_MAPPING:
@@ -418,64 +429,107 @@ async def translate_text(text: str = Form(...), source_lang: str = Form(...), target_lang: str = Form(...)):
418 |      logger.info(f"Translate-text requested: {text} from {source_lang} to {target_lang}")
419 |      request_id = str(uuid.uuid4())
420 |
421 |      # Translate the text
422 |      source_code = LANGUAGE_MAPPING[source_lang]
423 |      target_code = LANGUAGE_MAPPING[target_lang]
424 |      translated_text = "Translation not available"
425 |
426-449 | - …
450 |      # Convert translated text to speech
451 | -     speech, sample_rate, error = text_to_speech(translated_text, target_code)
452 | -
453 |      output_audio_url = None
454 | - …
455 |          # Save the audio as a WAV file
456 |          output_filename = f"{request_id}.wav"
457 |          output_path = os.path.join(AUDIO_DIR, output_filename)
458 |          save_pcm_to_wav(speech.tolist(), sample_rate, output_path)
459 | - …
460 |          # Generate a URL to the WAV file
461 | -         output_audio_url = f"https://jerich-…
462 |          logger.info("TTS conversion completed")
463 |
464 |      return {
465 |          "request_id": request_id,
466 | -         "status": "completed",
467 | -         "message": "Translation and TTS completed…
468 |          "source_text": text,
469 |          "translated_text": translated_text,
470 |          "output_audio": output_audio_url,
471 | -         "…
472 |      }
473 |
474 |  @app.post("/translate-audio")
475 |  async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form(...), target_lang: str = Form(...)):
476 |      """Endpoint to transcribe, translate, and convert audio to speech"""
477 | -     global stt_models, mt_model, mt_tokenizer
478 | -
479 |      if not audio:
480 |          raise HTTPException(status_code=400, detail="No audio file provided")
481 |      if source_lang not in LANGUAGE_MAPPING or target_lang not in LANGUAGE_MAPPING:
@@ -484,38 +538,35 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form(...), target_lang: str = Form(...)):
484 |      logger.info(f"Translate-audio requested: {audio.filename} from {source_lang} to {target_lang}")
485 |      request_id = str(uuid.uuid4())
486 |
487 | -     # Check if appropriate STT model is loaded
488 |      source_code = LANGUAGE_MAPPING[source_lang]
489 | - …
490 |
491-492 | - …
493 | -         if …
494 | -             logger.warning("MMS STT model not loaded either, returning placeholder response")
495 |              return {
496 |                  "request_id": request_id,
497 | -                 "status": "…
498 | -                 "message": "STT …
499 | -                 "source_text": "Transcription not available",
500 | -                 "translated_text": "Translation not available",
501 |                  "output_audio": None,
502 | -                 "…
503 |              }
504 | -
505 | - …
506 | -         logger.warning("MMS STT model not loaded for non-English/Tagalog, checking Whisper")
507 | -         if model_status["stt_whisper_small"] != "loaded" or stt_models["whisper_small"] is None:
508 | -             logger.warning("Whisper Small STT model not loaded either, returning placeholder response")
509 |              return {
510 |                  "request_id": request_id,
511 | -                 "status": "…
512 | -                 "message": "STT …
513 | -                 "source_text": "Transcription not available",
514 | -                 "translated_text": "Translation not available",
515 |                  "output_audio": None,
516 | -                 "…
517 |              }
518 | -         use_whisper = True  # Fall back to Whisper
519 |
520 |      # Save the uploaded audio to a temporary file
521 |      with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
@@ -525,7 +576,7 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form(...), target_lang: str = Form(...)):
525 |      transcription = "Transcription not available"
526 |      translated_text = "Translation not available"
527 |      output_audio_url = None
528 | - …
529 |
530 |      try:
531 |          # Step 1: Load and resample the audio using torchaudio
@@ -549,94 +600,132 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form(...), target_lang: str = Form(...)):
549 |                  "source_text": "No speech detected",
550 |                  "translated_text": "No translation available",
551 |                  "output_audio": None,
552 | -                 "…
553 |              }
554 |
555 |          # Step 3: Transcribe the audio (STT)
556 |          device = "cuda" if torch.cuda.is_available() else "cpu"
557 | -         logger.info(f"Using device: {device}")
558 |
559 |          if use_whisper:
560 | -             # Use Whisper …
561-563 | - …
564 |
565 | -             inputs = processor(waveform.numpy()[0], sampling_rate=16000, return_tensors="pt").to(device)
566 |              with torch.no_grad():
567-568 | - …
569 | -                 generated_ids = model.generate(
570 | -                     **inputs,
571 | -                     language=language,
572 | -                     task="transcribe"
573 | -                 )
574 | -             transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
575 |          else:
576 | -             # Use MMS for other languages
577-579 | - …
580 |
581-582 | - …
583 | -             model.load_adapter(source_code)
584 |
585 | -             inputs = processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt").to(device)
586 |              with torch.no_grad():
587 | -                 logits = …
588 |              predicted_ids = torch.argmax(logits, dim=-1)
589 | -             transcription = …
590 | -
591 |          logger.info(f"Transcription completed: {transcription}")
592 |
593 | -         # Step 4: …
594-614 | - …
615 |
616 | -         # Step …
617 | - …
618 |
619 | -         # Step …
620-622 | - …
623 |              # Save the audio as a WAV file
624 |              output_filename = f"{request_id}.wav"
625 |              output_path = os.path.join(AUDIO_DIR, output_filename)
626 |              save_pcm_to_wav(speech.tolist(), sample_rate, output_path)
627 | - …
628 |              # Generate a URL to the WAV file
629 | -             output_audio_url = f"https://jerich-…
630 |              logger.info("TTS conversion completed")
631 |
632 |          return {
633 |              "request_id": request_id,
634 | -             "status": "completed",
635 | -             "message": "Transcription, translation, and TTS completed",
636 |              "source_text": transcription,
637 |              "translated_text": translated_text,
638 |              "output_audio": output_audio_url,
639 | -             "…
640 |          }
641 |      except Exception as e:
642 |          logger.error(f"Error during processing: {str(e)}")
@@ -647,11 +736,28 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form(...), target_lang: str = Form(...)):
647 |              "source_text": transcription,
648 |              "translated_text": translated_text,
649 |              "output_audio": output_audio_url,
650 | -             "…
651 |          }
652 |      finally:
653 |          logger.info(f"Cleaning up temporary file: {temp_path}")
654 | - …
655 |
656 |  if __name__ == "__main__":
657 |      import uvicorn
31 |   app.mount("/audio_output", StaticFiles(directory=AUDIO_DIR), name="audio_output")
32 |
33 |   # Global variables to track application state
34 | + model_cache = {
35 | +     "stt_whisper": {"model": None, "processor": None, "status": "not_loaded"},
36 | +     "stt_mms": {"model": None, "processor": None, "status": "not_loaded"},
37 | +     "mt": {"model": None, "tokenizer": None, "status": "not_loaded"},
38 | +     "tts": {"model": None, "tokenizer": None, "status": "not_loaded", "language": None}
39 | + }
40 | +
41 | + # Track loading status
42 | + loading_locks = {
43 | +     "stt_whisper": threading.Lock(),
44 | +     "stt_mms": threading.Lock(),
45 | +     "mt": threading.Lock(),
46 | +     "tts": threading.Lock()
47 |   }
48 |
49 |   # Define the valid languages and mappings
50 |   LANGUAGE_MAPPING = {
65 |   "pag": "pag_Latn"
66 |   }
67 |
68 | + # Inappropriate words list - this is a basic implementation
69 | + # In a production environment, you would use a more comprehensive solution
70 | + INAPPROPRIATE_WORDS = [
71 | +     "putang", "tang ina", "gago", "puta", "bobo", "ulol", "pakyu", "tae",
72 | +     "obscenity", "profanity", "explicit", "nsfw", "offensive"
73 | + ]
74 |
75 |
76 | + # Function to detect inappropriate content
77 | + def detect_inappropriate_content(text: str) -> bool:
78 | +     """
79 | +     Checks if the text contains any inappropriate words
80 | +     """
81 | +     text_lower = text.lower()
82 | +     for word in INAPPROPRIATE_WORDS:
83 | +         if word in text_lower:
84 | +             return True
85 | +     return False
86 |
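Note that the substring test above also matches words embedded in longer tokens (for example "tae" inside "taekwondo"). The check_inappropriate_content function that this commit removes avoided that with word-boundary regexes; a sketch of that stricter variant, reusing the module-level INAPPROPRIATE_WORDS:

import re

def detect_inappropriate_content_strict(text: str) -> bool:
    """Match whole words only, as the removed check_inappropriate_content did."""
    text_lower = text.lower()
    for word in INAPPROPRIATE_WORDS:
        # \b anchors keep "tae" from matching inside "taekwondo"
        if re.search(r"\b" + re.escape(word) + r"\b", text_lower):
            return True
    return False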
87 |
88 |   # Function to save PCM data as a WAV file
89 |   def save_pcm_to_wav(pcm_data: list, sample_rate: int, output_path: str):
98 |       # Write the 16-bit PCM data as bytes (little-endian)
99 |       wav_file.writeframes(pcm_array.tobytes())
100 |
101 | +
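Only the tail of save_pcm_to_wav is visible in this hunk. For reference, a minimal body consistent with the lines shown (a sketch assuming 16-bit mono PCM, not the committed code):

import wave
import numpy as np

def save_pcm_to_wav_sketch(pcm_data: list, sample_rate: int, output_path: str):
    pcm_array = np.array(pcm_data, dtype=np.int16)
    with wave.open(output_path, "wb") as wav_file:
        wav_file.setnchannels(1)   # mono
        wav_file.setsampwidth(2)   # 16-bit samples
        wav_file.setframerate(sample_rate)
        # Write the 16-bit PCM data as bytes (little-endian)
        wav_file.writeframes(pcm_array.tobytes())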
102 |   # Function to detect speech using an energy-based approach
103 |   def detect_speech(waveform: torch.Tensor, sample_rate: int, threshold: float = 0.01, min_speech_duration: float = 0.5) -> bool:
104 |       """
123 |       # For now, we assume if RMS is above threshold, there is speech
124 |       return True
125 |
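The body of detect_speech is elided from this view; the surviving comment says it reduces to an RMS-energy threshold. A sketch of such a check, assuming the stated semantics (min_speech_duration is accepted but unused, matching the simplistic version the comment describes):

import torch

def detect_speech_sketch(waveform: torch.Tensor, sample_rate: int,
                         threshold: float = 0.01, min_speech_duration: float = 0.5) -> bool:
    # Root-mean-square energy over the whole clip
    rms = torch.sqrt(torch.mean(waveform.float() ** 2))
    return bool(rms.item() > threshold)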
126 |
127 |   # Function to clean up old audio files
128 |   def cleanup_old_audio_files():
139 |       except Exception as e:
140 |           logger.error(f"Error deleting file {file_path}: {str(e)}")
141 |
142 | +
143 |   # Background task to periodically clean up audio files
144 |   def schedule_cleanup():
145 |       while True:
146 |           cleanup_old_audio_files()
147 |           time.sleep(300)  # Run every 5 minutes (300 seconds)
148 |
149 | +
150 | + # Function to load the Whisper STT model on demand
151 | + def load_whisper_model():
152 | +     if model_cache["stt_whisper"]["status"] == "loaded":
153 | +         return True
154 | +
155 | +     # Use lock to prevent multiple threads from loading the model simultaneously
156 | +     if not loading_locks["stt_whisper"].acquire(blocking=False):
157 | +         logger.info("Whisper model loading already in progress")
158 | +         return False
159 |
160 |       try:
161 | +         logger.info("Loading Whisper small model...")
162 | +         model_cache["stt_whisper"]["status"] = "loading"
163 | +
164 | +         from transformers import WhisperProcessor, WhisperForConditionalGeneration
165 |           device = "cuda" if torch.cuda.is_available() else "cpu"
166 |
167 | +         model_cache["stt_whisper"]["processor"] = WhisperProcessor.from_pretrained("openai/whisper-small")
168 | +         model_cache["stt_whisper"]["model"] = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
169 | +         model_cache["stt_whisper"]["model"].to(device)
170 |
171 | +         model_cache["stt_whisper"]["status"] = "loaded"
172 | +         logger.info("Whisper small model loaded successfully")
173 | +         return True
174 | +     except Exception as e:
175 | +         model_cache["stt_whisper"]["status"] = "failed"
176 | +         logger.error(f"Failed to load Whisper model: {str(e)}")
177 | +         return False
178 | +     finally:
179 | +         loading_locks["stt_whisper"].release()
180 | +
181 | +
182 | + # Function to load the MMS STT model on demand
183 | + def load_mms_stt_model():
184 | +     if model_cache["stt_mms"]["status"] == "loaded":
185 | +         return True
186 | +
187 | +     if not loading_locks["stt_mms"].acquire(blocking=False):
188 | +         logger.info("MMS STT model loading already in progress")
189 | +         return False
190 | +
191 | +     try:
192 | +         logger.info("Loading MMS STT model...")
193 | +         model_cache["stt_mms"]["status"] = "loading"
194 |
195 | +         from transformers import AutoProcessor, AutoModelForCTC
196 | +         device = "cuda" if torch.cuda.is_available() else "cpu"
197 |
198 | +         model_cache["stt_mms"]["processor"] = AutoProcessor.from_pretrained("facebook/mms-1b-all")
199 | +         model_cache["stt_mms"]["model"] = AutoModelForCTC.from_pretrained("facebook/mms-1b-all")
200 | +         model_cache["stt_mms"]["model"].to(device)
201 |
202 | +         model_cache["stt_mms"]["status"] = "loaded"
203 | +         logger.info("MMS STT model loaded successfully")
204 | +         return True
205 | +     except Exception as e:
206 | +         model_cache["stt_mms"]["status"] = "failed"
207 | +         logger.error(f"Failed to load MMS STT model: {str(e)}")
208 | +         return False
209 | +     finally:
210 | +         loading_locks["stt_mms"].release()
211 | +
212 | +
213 | + # Function to load the MT model on demand
214 | + def load_mt_model():
215 | +     if model_cache["mt"]["status"] == "loaded":
216 | +         return True
217 | +
218 | +     if not loading_locks["mt"].acquire(blocking=False):
219 | +         logger.info("MT model loading already in progress")
220 | +         return False
221 | +
222 | +     try:
223 | +         logger.info("Loading NLLB-200-distilled-600M model...")
224 | +         model_cache["mt"]["status"] = "loading"
225 |
226 | +         from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
227 | +         device = "cuda" if torch.cuda.is_available() else "cpu"
228 |
229 | +         model_cache["mt"]["model"] = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
230 | +         model_cache["mt"]["tokenizer"] = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
231 | +         model_cache["mt"]["model"].to(device)
232 |
233 | +         model_cache["mt"]["status"] = "loaded"
234 | +         logger.info("MT model loaded successfully")
235 | +         return True
236 | +     except Exception as e:
237 | +         model_cache["mt"]["status"] = "failed"
238 | +         logger.error(f"Failed to load MT model: {str(e)}")
239 | +         return False
240 | +     finally:
241 | +         loading_locks["mt"].release()
242 | +
243 | +
+
# Function to load the TTS model for a specific language on demand
|
245 |
+
def load_tts_model(language_code: str):
|
246 |
+
# If the model is already loaded for this language, return immediately
|
247 |
+
if (model_cache["tts"]["status"] == "loaded" and
|
248 |
+
model_cache["tts"]["language"] == language_code):
|
249 |
+
return True
|
250 |
+
|
251 |
+
if not loading_locks["tts"].acquire(blocking=False):
|
252 |
+
logger.info("TTS model loading already in progress")
|
253 |
+
return False
|
254 |
+
|
255 |
+
try:
|
256 |
+
logger.info(f"Loading MMS-TTS model for {language_code}...")
|
257 |
+
model_cache["tts"]["status"] = "loading"
|
258 |
|
259 |
+
from transformers import VitsModel, AutoTokenizer
|
260 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
261 |
+
|
262 |
+
try:
|
263 |
+
model_cache["tts"]["model"] = VitsModel.from_pretrained(f"facebook/mms-tts-{language_code}")
|
264 |
+
model_cache["tts"]["tokenizer"] = AutoTokenizer.from_pretrained(f"facebook/mms-tts-{language_code}")
|
265 |
+
model_cache["tts"]["model"].to(device)
|
266 |
+
model_cache["tts"]["language"] = language_code
|
267 |
+
model_cache["tts"]["status"] = "loaded"
|
268 |
+
logger.info(f"TTS model for {language_code} loaded successfully")
|
269 |
+
return True
|
270 |
+
except Exception as e:
|
271 |
+
logger.error(f"Failed to load TTS model for {language_code}: {str(e)}")
|
272 |
+
# Fallback to English TTS if the target language fails
|
273 |
+
try:
|
274 |
+
logger.info("Falling back to MMS-TTS English model...")
|
275 |
+
model_cache["tts"]["model"] = VitsModel.from_pretrained("facebook/mms-tts-eng")
|
276 |
+
model_cache["tts"]["tokenizer"] = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
|
277 |
+
model_cache["tts"]["model"].to(device)
|
278 |
+
model_cache["tts"]["language"] = "eng"
|
279 |
+
model_cache["tts"]["status"] = "loaded (fallback)"
|
280 |
+
logger.info("Fallback TTS model loaded successfully")
|
281 |
+
return True
|
282 |
+
except Exception as e2:
|
283 |
+
model_cache["tts"]["status"] = "failed"
|
284 |
+
logger.error(f"Failed to load fallback TTS model: {str(e2)}")
|
285 |
+
return False
|
286 |
except Exception as e:
|
287 |
+
model_cache["tts"]["status"] = "failed"
|
288 |
+
logger.error(f"Failed to setup TTS model: {str(e)}")
|
289 |
+
return False
|
290 |
finally:
|
291 |
+
loading_locks["tts"].release()
|
292 |
293 |
294 |   # Start the background cleanup task
295 |   def start_cleanup_task():
297 |       cleanup_thread.daemon = True
298 |       cleanup_thread.start()
299 |
300 | +
301 |   # Start the background processes when the app starts
302 |   @app.on_event("startup")
303 |   async def startup_event():
304 |       logger.info("Application starting up...")
305 |       start_cleanup_task()
306 |
307 | +
308 |   @app.get("/")
309 |   async def root():
310 |       """Root endpoint for default health check"""
311 |       logger.info("Root endpoint requested")
312 |       return {"status": "healthy"}
313 |
314 | +
315 |   @app.get("/health")
316 |   async def health_check():
317 |       """Health check endpoint that always returns successfully"""
318 |       logger.info("Health check requested")
319 |       return {
320 |           "status": "healthy",
321 | +         "model_status": {
322 | +             "stt_whisper": model_cache["stt_whisper"]["status"],
323 | +             "stt_mms": model_cache["stt_mms"]["status"],
324 | +             "mt": model_cache["mt"]["status"],
325 | +             "tts": model_cache["tts"]["status"],
326 | +             "tts_language": model_cache["tts"]["language"]
327 | +         }
328 |       }
329 |
330 | +
331 | + @app.post("/update-languages")
332 | + async def update_languages(source_lang: str = Form(...), target_lang: str = Form(...)):
333 | +     """
334 | +     Update the language settings for translation services
335 | +     Will trigger loading of necessary models if not already loaded
336 | +     """
337 | +     if source_lang not in LANGUAGE_MAPPING or target_lang not in LANGUAGE_MAPPING:
338 | +         raise HTTPException(status_code=400, detail="Invalid language selected")
339 | +
340 | +     source_code = LANGUAGE_MAPPING[source_lang]
341 | +     target_code = LANGUAGE_MAPPING[target_lang]
342 | +
343 | +     # Determine which STT model to use based on the source language
344 | +     if source_code in ["eng", "tgl"]:
345 | +         # Load Whisper for English or Tagalog
346 | +         if not load_whisper_model():
347 | +             return {"status": "pending", "message": "Whisper model loading in progress"}
348 | +     else:
349 | +         # Load MMS for other Philippine languages
350 | +         if not load_mms_stt_model():
351 | +             return {"status": "pending", "message": "MMS STT model loading in progress"}
352 | +
353 | +     # Load the MT model if not already loaded
354 | +     if not load_mt_model():
355 | +         return {"status": "pending", "message": "MT model loading in progress"}
356 | +
357 | +     # Load the appropriate TTS model for the target language
358 | +     if not load_tts_model(target_code):
359 | +         return {"status": "pending", "message": "TTS model loading in progress"}
360 | +
361 | +     logger.info(f"Languages updated to {source_lang} → {target_lang}")
362 | +     return {"status": "success", "message": f"Languages updated to {source_lang} → {target_lang}"}
363 | +
364 | +
365 |
@app.post("/synthesize-speech")
|
366 |
async def synthesize_speech(text: str = Form(...), language: str = Form(...)):
|
367 |
"""Endpoint to synthesize speech from text without translation"""
|
368 |
if language not in LANGUAGE_MAPPING:
|
369 |
raise HTTPException(status_code=400, detail="Invalid language selected")
|
370 |
|
|
|
|
|
371 |
language_code = LANGUAGE_MAPPING[language]
|
372 |
+
request_id = str(uuid.uuid4())
|
373 |
|
374 |
+
# Load the TTS model for the requested language
|
375 |
+
if not load_tts_model(language_code):
|
376 |
return {
|
377 |
"request_id": request_id,
|
378 |
+
"status": "pending",
|
379 |
+
"message": "TTS model loading in progress. Please try again in a moment."
|
|
|
|
|
380 |
}
|
381 |
|
382 |
+
try:
|
383 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
384 |
+
inputs = model_cache["tts"]["tokenizer"](text, return_tensors="pt").to(device)
|
385 |
+
|
386 |
+
with torch.no_grad():
|
387 |
+
output = model_cache["tts"]["model"](**inputs)
|
388 |
+
|
389 |
+
speech = output.waveform.cpu().numpy().squeeze()
|
390 |
+
speech = (speech * 32767).astype(np.int16)
|
391 |
+
sample_rate = model_cache["tts"]["model"].config.sampling_rate
|
392 |
+
|
393 |
+
# Save the audio as a WAV file
|
394 |
+
output_filename = f"{request_id}.wav"
|
395 |
+
output_path = os.path.join(AUDIO_DIR, output_filename)
|
396 |
+
save_pcm_to_wav(speech.tolist(), sample_rate, output_path)
|
397 |
+
logger.info(f"Saved synthesized audio to {output_path}")
|
398 |
+
|
399 |
+
# Generate a URL to the WAV file
|
400 |
+
output_audio_url = f"https://jerich-talklasapp.hf.space/audio_output/{output_filename}"
|
401 |
+
|
402 |
+
return {
|
403 |
+
"request_id": request_id,
|
404 |
+
"status": "completed",
|
405 |
+
"message": "Speech synthesis completed successfully",
|
406 |
+
"text": text,
|
407 |
+
"output_audio": output_audio_url
|
408 |
+
}
|
409 |
|
410 |
+
except Exception as e:
|
411 |
+
logger.error(f"Error during speech synthesis: {str(e)}")
|
412 |
return {
|
413 |
"request_id": request_id,
|
414 |
"status": "failed",
|
415 |
+
"message": f"Speech synthesis failed: {str(e)}",
|
416 |
+
"text": text,
|
417 |
+
"output_audio": None
|
418 |
}
|
419 |
+
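Because the endpoint answers "pending" while the TTS model is still loading, a caller has to retry; a sketch of that polling loop (URL and field values are assumptions):

import time
import requests

def synthesize_with_retry(text: str, language: str, retries: int = 5, wait_s: float = 10.0):
    """Call /synthesize-speech until the model finishes loading or retries run out."""
    body = None
    for _ in range(retries):
        r = requests.post(
            "http://localhost:7860/synthesize-speech",  # assumed local dev URL
            data={"text": text, "language": language},
        )
        body = r.json()
        if body.get("status") != "pending":
            break
        time.sleep(wait_s)
    return body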
420 |
421 |   @app.post("/translate-text")
422 |   async def translate_text(text: str = Form(...), source_lang: str = Form(...), target_lang: str = Form(...)):
423 |       """Endpoint to translate text and convert to speech"""
424 |       if not text:
425 |           raise HTTPException(status_code=400, detail="No text provided")
426 |       if source_lang not in LANGUAGE_MAPPING or target_lang not in LANGUAGE_MAPPING:
429 |       logger.info(f"Translate-text requested: {text} from {source_lang} to {target_lang}")
430 |       request_id = str(uuid.uuid4())
431 |
432 | +     # Load the MT model if not already loaded
433 | +     if not load_mt_model():
434 | +         return {
435 | +             "request_id": request_id,
436 | +             "status": "pending",
437 | +             "message": "MT model loading in progress. Please try again in a moment.",
438 | +             "source_text": text,
439 | +             "translated_text": "Translation not available yet",
440 | +             "output_audio": None,
441 | +             "contains_inappropriate_content": False
442 | +         }
443 | +
444 |       # Translate the text
445 |       source_code = LANGUAGE_MAPPING[source_lang]
446 |       target_code = LANGUAGE_MAPPING[target_lang]
447 |       translated_text = "Translation not available"
448 | +     contains_inappropriate = False
449 |
450 | +     try:
451 | +         source_nllb_code = NLLB_LANGUAGE_CODES[source_code]
452 | +         target_nllb_code = NLLB_LANGUAGE_CODES[target_code]
453 | +         model_cache["mt"]["tokenizer"].src_lang = source_nllb_code
454 | +         device = "cuda" if torch.cuda.is_available() else "cpu"
455 | +         inputs = model_cache["mt"]["tokenizer"](text, return_tensors="pt").to(device)
456 | +         with torch.no_grad():
457 | +             generated_tokens = model_cache["mt"]["model"].generate(
458 | +                 **inputs,
459 | +                 forced_bos_token_id=model_cache["mt"]["tokenizer"].convert_tokens_to_ids(target_nllb_code),
460 | +                 max_length=448
461 | +             )
462 | +         translated_text = model_cache["mt"]["tokenizer"].batch_decode(generated_tokens, skip_special_tokens=True)[0]
463 | +         logger.info(f"Translation completed: {translated_text}")
464 | +
465 | +         # Check for inappropriate content
466 | +         contains_inappropriate = detect_inappropriate_content(translated_text)
467 | +         if contains_inappropriate:
468 | +             logger.warning(f"Inappropriate content detected in translation")
469 | +
470 | +     except Exception as e:
471 | +         logger.error(f"Error during translation: {str(e)}")
472 | +         translated_text = f"Translation failed: {str(e)}"
473 | +         return {
474 | +             "request_id": request_id,
475 | +             "status": "failed",
476 | +             "message": f"Translation failed: {str(e)}",
477 | +             "source_text": text,
478 | +             "translated_text": translated_text,
479 | +             "output_audio": None,
480 | +             "contains_inappropriate_content": contains_inappropriate
481 | +         }
482 | +
483 | +     # Load the TTS model for the target language
484 | +     if not load_tts_model(target_code):
485 | +         return {
486 | +             "request_id": request_id,
487 | +             "status": "partial",
488 | +             "message": "Translation completed, but TTS model is loading. Please try again for audio.",
489 | +             "source_text": text,
490 | +             "translated_text": translated_text,
491 | +             "output_audio": None,
492 | +             "contains_inappropriate_content": contains_inappropriate
493 | +         }
494 | +
495 |       # Convert translated text to speech
496 |       output_audio_url = None
497 | +     try:
498 | +         inputs = model_cache["tts"]["tokenizer"](translated_text, return_tensors="pt").to(device)
499 | +         with torch.no_grad():
500 | +             output = model_cache["tts"]["model"](**inputs)
501 | +         speech = output.waveform.cpu().numpy().squeeze()
502 | +         speech = (speech * 32767).astype(np.int16)
503 | +         sample_rate = model_cache["tts"]["model"].config.sampling_rate
504 | +
505 |           # Save the audio as a WAV file
506 |           output_filename = f"{request_id}.wav"
507 |           output_path = os.path.join(AUDIO_DIR, output_filename)
508 |           save_pcm_to_wav(speech.tolist(), sample_rate, output_path)
509 | +         logger.info(f"Saved synthesized audio to {output_path}")
510 | +
511 |           # Generate a URL to the WAV file
512 | +         output_audio_url = f"https://jerich-talklasapp.hf.space/audio_output/{output_filename}"
513 |           logger.info("TTS conversion completed")
514 | +     except Exception as e:
515 | +         logger.error(f"Error during TTS conversion: {str(e)}")
516 | +         output_audio_url = None
517 |
518 |       return {
519 |           "request_id": request_id,
520 | +         "status": "completed" if output_audio_url else "partial",
521 | +         "message": "Translation and TTS completed" if output_audio_url else
522 | +                    "Translation completed but TTS failed",
523 |           "source_text": text,
524 |           "translated_text": translated_text,
525 |           "output_audio": output_audio_url,
526 | +         "contains_inappropriate_content": contains_inappropriate
527 |       }
528 |
529 | +
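The NLLB call above selects the output language by forcing the first generated token to be the target's language token. The same mechanics outside the endpoint, as a standalone sketch (codes as defined in NLLB_LANGUAGE_CODES, e.g. eng_Latn → tgl_Latn):

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
mdl = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

tok.src_lang = "eng_Latn"                       # source language token
batch = tok("Good morning", return_tensors="pt")
out = mdl.generate(
    **batch,
    forced_bos_token_id=tok.convert_tokens_to_ids("tgl_Latn"),  # target language token
    max_length=448,
)
print(tok.batch_decode(out, skip_special_tokens=True)[0])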
530 |   @app.post("/translate-audio")
531 |   async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form(...), target_lang: str = Form(...)):
532 |       """Endpoint to transcribe, translate, and convert audio to speech"""
533 |       if not audio:
534 |           raise HTTPException(status_code=400, detail="No audio file provided")
535 |       if source_lang not in LANGUAGE_MAPPING or target_lang not in LANGUAGE_MAPPING:
538 |       logger.info(f"Translate-audio requested: {audio.filename} from {source_lang} to {target_lang}")
539 |       request_id = str(uuid.uuid4())
540 |
541 |       source_code = LANGUAGE_MAPPING[source_lang]
542 | +     target_code = LANGUAGE_MAPPING[target_lang]
543 | +
544 | +     # Determine which STT model to use based on source language
545 | +     use_whisper = source_code in ["eng", "tgl"]
546 |
547 | +     # Ensure the appropriate STT model is loaded
548 | +     if use_whisper:
549 | +         if not load_whisper_model():
550 |               return {
551 |                   "request_id": request_id,
552 | +                 "status": "pending",
553 | +                 "message": "Whisper STT model loading in progress. Please try again in a moment.",
554 | +                 "source_text": "Transcription not available yet",
555 | +                 "translated_text": "Translation not available yet",
556 |                   "output_audio": None,
557 | +                 "contains_inappropriate_content": False
558 |               }
559 | +     else:
560 | +         if not load_mms_stt_model():
561 |               return {
562 |                   "request_id": request_id,
563 | +                 "status": "pending",
564 | +                 "message": "MMS STT model loading in progress. Please try again in a moment.",
565 | +                 "source_text": "Transcription not available yet",
566 | +                 "translated_text": "Translation not available yet",
567 |                   "output_audio": None,
568 | +                 "contains_inappropriate_content": False
569 |               }
570 |
571 |       # Save the uploaded audio to a temporary file
572 |       with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
576 |       transcription = "Transcription not available"
577 |       translated_text = "Translation not available"
578 |       output_audio_url = None
579 | +     contains_inappropriate = False
580 |
581 |       try:
582 |           # Step 1: Load and resample the audio using torchaudio
600 |                   "source_text": "No speech detected",
601 |                   "translated_text": "No translation available",
602 |                   "output_audio": None,
603 | +                 "contains_inappropriate_content": False
604 |               }
605 |
606 |           # Step 3: Transcribe the audio (STT)
607 |           device = "cuda" if torch.cuda.is_available() else "cpu"
608 | +         logger.info(f"Using device: {device}")
609 |
610 |           if use_whisper:
611 | +             # Use Whisper for English/Tagalog
612 | +             stt_processor = model_cache["stt_whisper"]["processor"]
613 | +             stt_model = model_cache["stt_whisper"]["model"]
614 | +
615 | +             inputs = stt_processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt").to(device)
616 | +             logger.info("Audio processed with Whisper, generating transcription...")
617 |
618 |               with torch.no_grad():
619 | +                 generated_ids = stt_model.generate(**inputs, language="en" if source_code == "eng" else "tl")
620 | +             transcription = stt_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
621 |           else:
622 | +             # Use MMS for other Philippine languages
623 | +             stt_processor = model_cache["stt_mms"]["processor"]
624 | +             stt_model = model_cache["stt_mms"]["model"]
625 | +
626 | +             # Set the target language for MMS if supported
627 | +             if source_code in stt_processor.tokenizer.vocab.keys():
628 | +                 stt_processor.tokenizer.set_target_lang(source_code)
629 | +                 stt_model.load_adapter(source_code)
630 |
631 | +             inputs = stt_processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt").to(device)
632 | +             logger.info("Audio processed with MMS, generating transcription...")
633 |
634 |               with torch.no_grad():
635 | +                 logits = stt_model(**inputs).logits
636 |               predicted_ids = torch.argmax(logits, dim=-1)
637 | +             transcription = stt_processor.batch_decode(predicted_ids)[0]
638 | +
639 |           logger.info(f"Transcription completed: {transcription}")
640 |
641 | +         # Step 4: Load the MT model if not already loaded
642 | +         if not load_mt_model():
643 | +             return {
644 | +                 "request_id": request_id,
645 | +                 "status": "partial",
646 | +                 "message": "Transcription completed, but MT model is loading. Please try again for translation.",
647 | +                 "source_text": transcription,
648 | +                 "translated_text": "Translation not available yet",
649 | +                 "output_audio": None,
650 | +                 "contains_inappropriate_content": False
651 | +             }
652 | +
653 | +         # Step 5: Translate the transcribed text (MT)
654 | +         try:
655 | +             source_nllb_code = NLLB_LANGUAGE_CODES[source_code]
656 | +             target_nllb_code = NLLB_LANGUAGE_CODES[target_code]
657 | +             model_cache["mt"]["tokenizer"].src_lang = source_nllb_code
658 | +
659 | +             inputs = model_cache["mt"]["tokenizer"](transcription, return_tensors="pt").to(device)
660 | +             with torch.no_grad():
661 | +                 generated_tokens = model_cache["mt"]["model"].generate(
662 | +                     **inputs,
663 | +                     forced_bos_token_id=model_cache["mt"]["tokenizer"].convert_tokens_to_ids(target_nllb_code),
664 | +                     max_length=448
665 | +                 )
666 | +             translated_text = model_cache["mt"]["tokenizer"].batch_decode(generated_tokens, skip_special_tokens=True)[0]
667 | +             logger.info(f"Translation completed: {translated_text}")
668 | +
669 | +             # Check for inappropriate content
670 | +             contains_inappropriate = detect_inappropriate_content(translated_text)
671 | +             if contains_inappropriate:
672 | +                 logger.warning(f"Inappropriate content detected in translation")
673 | +
674 | +         except Exception as e:
675 | +             logger.error(f"Error during translation: {str(e)}")
676 | +             translated_text = f"Translation failed: {str(e)}"
677 | +             return {
678 | +                 "request_id": request_id,
679 | +                 "status": "partial",
680 | +                 "message": f"Transcription completed but translation failed: {str(e)}",
681 | +                 "source_text": transcription,
682 | +                 "translated_text": translated_text,
683 | +                 "output_audio": None,
684 | +                 "contains_inappropriate_content": False
685 | +             }
686 |
687 | +         # Step 6: Load the TTS model for the target language
688 | +         if not load_tts_model(target_code):
689 | +             return {
690 | +                 "request_id": request_id,
691 | +                 "status": "partial",
692 | +                 "message": "Transcription and translation completed, but TTS model is loading.",
693 | +                 "source_text": transcription,
694 | +                 "translated_text": translated_text,
695 | +                 "output_audio": None,
696 | +                 "contains_inappropriate_content": contains_inappropriate
697 | +             }
698 |
699 | +         # Step 7: Convert translated text to speech (TTS)
700 | +         try:
701 | +             inputs = model_cache["tts"]["tokenizer"](translated_text, return_tensors="pt").to(device)
702 | +             with torch.no_grad():
703 | +                 output = model_cache["tts"]["model"](**inputs)
704 | +             speech = output.waveform.cpu().numpy().squeeze()
705 | +             speech = (speech * 32767).astype(np.int16)
706 | +             sample_rate = model_cache["tts"]["model"].config.sampling_rate
707 |               # Save the audio as a WAV file
708 |               output_filename = f"{request_id}.wav"
709 |               output_path = os.path.join(AUDIO_DIR, output_filename)
710 |               save_pcm_to_wav(speech.tolist(), sample_rate, output_path)
711 | +             logger.info(f"Saved synthesized audio to {output_path}")
712 | +
713 |               # Generate a URL to the WAV file
714 | +             output_audio_url = f"https://jerich-talklasapp.hf.space/audio_output/{output_filename}"
715 |               logger.info("TTS conversion completed")
716 | +         except Exception as e:
717 | +             logger.error(f"Error during TTS conversion: {str(e)}")
718 | +             output_audio_url = None
719 |
720 |           return {
721 |               "request_id": request_id,
722 | +             "status": "completed" if output_audio_url else "partial",
723 | +             "message": "Transcription, translation, and TTS completed" if output_audio_url else
724 | +                        "Transcription and translation completed but TTS failed",
725 |               "source_text": transcription,
726 |               "translated_text": translated_text,
727 |               "output_audio": output_audio_url,
728 | +             "contains_inappropriate_content": contains_inappropriate
729 |           }
730 |       except Exception as e:
731 |           logger.error(f"Error during processing: {str(e)}")
736 |               "source_text": transcription,
737 |               "translated_text": translated_text,
738 |               "output_audio": output_audio_url,
739 | +             "contains_inappropriate_content": contains_inappropriate
740 |           }
741 |       finally:
742 |           logger.info(f"Cleaning up temporary file: {temp_path}")
743 | +         try:
744 | +             os.unlink(temp_path)
745 | +         except Exception as e:
746 | +             logger.error(f"Error deleting temporary file: {str(e)}")
747 | +
748 | +
749 | + # Add a method to check if text contains inappropriate content
750 | + @app.post("/check-content")
751 | + async def check_content(text: str = Form(...)):
752 | +     """
753 | +     Check if the provided text contains inappropriate content
754 | +     """
755 | +     contains_inappropriate = detect_inappropriate_content(text)
756 | +     return {
757 | +         "text": text,
758 | +         "contains_inappropriate_content": contains_inappropriate
759 | +     }
760 | +
761 |
762 |   if __name__ == "__main__":
763 |       import uvicorn