Jerich committed on
Commit
dec41a9
·
verified ·
1 Parent(s): 150f666

Fix Whisper language handling for Tagalog in translate-audio endpoint

Browse files

- Added WHISPER_LANGUAGE_MAPPING to convert ISO codes (eng, tgl) to Whisper-compatible language names (english, tagalog)
- Updated /translate-audio endpoint to use correct language names for Whisper model
- Removed forced_decoder_ids to resolve conflict with language parameter
- Ensured default fallback to English for unmapped languages
- Addresses error: "Unsupported language: tgl" and forced_decoder_ids warning

Files changed (1) hide show
  1. app.py +10 -3
app.py CHANGED
@@ -17,7 +17,7 @@ from fastapi import FastAPI, HTTPException, UploadFile, File, Form, BackgroundTa
17
  from fastapi.responses import JSONResponse
18
  from fastapi.staticfiles import StaticFiles
19
  from typing import Dict, Any, Optional, Tuple, List
20
- from datetime import datetime, timedelta
21
 
22
  # Configure logging
23
  logging.basicConfig(level=logging.INFO)
@@ -62,6 +62,12 @@ LANGUAGE_MAPPING = {
62
  "Pangasinan": "pag"
63
  }
64
 
 
 
 
 
 
 
65
  NLLB_LANGUAGE_CODES = {
66
  "eng": "eng_Latn",
67
  "tgl": "tgl_Latn",
@@ -294,7 +300,7 @@ def synthesize_speech(text: str, target_code: str) -> Tuple[Optional[str], Optio
294
  return None, "Failed to load TTS model for the target language"
295
  device = "cuda" if torch.cuda.is_available() else "cpu"
296
  try:
297
- inputs = tts_tokenizer(text, return_tensors="pt").to(device)
298
  with torch.no_grad():
299
  output = tts_model(**inputs)
300
  speech = output.waveform.cpu().numpy().squeeze()
@@ -463,9 +469,10 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
463
 
464
  if use_whisper:
465
  logger.info("Using Whisper model for transcription")
 
466
  inputs = stt_processor_whisper(waveform.numpy(), sampling_rate=16000, return_tensors="pt").to(device)
467
  with torch.no_grad():
468
- generated_ids = stt_model_whisper.generate(**inputs, language=source_code)
469
  transcription = stt_processor_whisper.batch_decode(generated_ids, skip_special_tokens=True)[0]
470
  else:
471
  logger.info("Using MMS model for transcription")
 
17
  from fastapi.responses import JSONResponse
18
  from fastapi.staticfiles import StaticFiles
19
  from typing import Dict, Any, Optional, Tuple, List
20
+ from datetime import datetime, timedelta
21
 
22
  # Configure logging
23
  logging.basicConfig(level=logging.INFO)
 
62
  "Pangasinan": "pag"
63
  }
64
 
65
+ # Mapping for Whisper language names
66
+ WHISPER_LANGUAGE_MAPPING = {
67
+ "eng": "english",
68
+ "tgl": "tagalog"
69
+ }
70
+
71
  NLLB_LANGUAGE_CODES = {
72
  "eng": "eng_Latn",
73
  "tgl": "tgl_Latn",
 
300
  return None, "Failed to load TTS model for the target language"
301
  device = "cuda" if torch.cuda.is_available() else "cpu"
302
  try:
303
+ inputs = tts_tokenizer(text, return_tensors="pt").to(device)
304
  with torch.no_grad():
305
  output = tts_model(**inputs)
306
  speech = output.waveform.cpu().numpy().squeeze()
 
469
 
470
  if use_whisper:
471
  logger.info("Using Whisper model for transcription")
472
+ whisper_lang = WHISPER_LANGUAGE_MAPPING.get(source_code, "english") # Default to English if not mapped
473
  inputs = stt_processor_whisper(waveform.numpy(), sampling_rate=16000, return_tensors="pt").to(device)
474
  with torch.no_grad():
475
+ generated_ids = stt_model_whisper.generate(**inputs, language=whisper_lang)
476
  transcription = stt_processor_whisper.batch_decode(generated_ids, skip_special_tokens=True)[0]
477
  else:
478
  logger.info("Using MMS model for transcription")