Spaces:
Sleeping
Sleeping
Fix Whisper language handling for Tagalog in translate-audio endpoint
Browse files
- Added WHISPER_LANGUAGE_MAPPING to convert ISO codes (eng, tgl) to Whisper-compatible language names (english, tagalog)
- Updated /translate-audio endpoint to use correct language names for Whisper model
- Removed forced_decoder_ids to resolve conflict with language parameter
- Ensured default fallback to English for unmapped languages
- Addresses error: "Unsupported language: tgl" and forced_decoder_ids warning
app.py
CHANGED
@@ -17,7 +17,7 @@ from fastapi import FastAPI, HTTPException, UploadFile, File, Form, BackgroundTa
|
|
17 |
from fastapi.responses import JSONResponse
|
18 |
from fastapi.staticfiles import StaticFiles
|
19 |
from typing import Dict, Any, Optional, Tuple, List
|
20 |
-
from
|
21 |
|
22 |
# Configure logging
|
23 |
logging.basicConfig(level=logging.INFO)
|
@@ -62,6 +62,12 @@ LANGUAGE_MAPPING = {
|
|
62 |
"Pangasinan": "pag"
|
63 |
}
|
64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
NLLB_LANGUAGE_CODES = {
|
66 |
"eng": "eng_Latn",
|
67 |
"tgl": "tgl_Latn",
|
@@ -294,7 +300,7 @@ def synthesize_speech(text: str, target_code: str) -> Tuple[Optional[str], Optio
|
|
294 |
return None, "Failed to load TTS model for the target language"
|
295 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
296 |
try:
|
297 |
-
inputs = tts_tokenizer(text, return_tensors="pt").
|
298 |
with torch.no_grad():
|
299 |
output = tts_model(**inputs)
|
300 |
speech = output.waveform.cpu().numpy().squeeze()
|
@@ -463,9 +469,10 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
|
|
463 |
|
464 |
if use_whisper:
|
465 |
logger.info("Using Whisper model for transcription")
|
|
|
466 |
inputs = stt_processor_whisper(waveform.numpy(), sampling_rate=16000, return_tensors="pt").to(device)
|
467 |
with torch.no_grad():
|
468 |
-
generated_ids = stt_model_whisper.generate(**inputs, language=
|
469 |
transcription = stt_processor_whisper.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
470 |
else:
|
471 |
logger.info("Using MMS model for transcription")
|
|
|
17 |
from fastapi.responses import JSONResponse
|
18 |
from fastapi.staticfiles import StaticFiles
|
19 |
from typing import Dict, Any, Optional, Tuple, List
|
20 |
+
from datetime import datetime, timedelta
|
21 |
|
22 |
# Configure logging
|
23 |
logging.basicConfig(level=logging.INFO)
|
|
|
62 |
"Pangasinan": "pag"
|
63 |
}
|
64 |
|
65 |
+
# Mapping for Whisper language names
|
66 |
+
WHISPER_LANGUAGE_MAPPING = {
|
67 |
+
"eng": "english",
|
68 |
+
"tgl": "tagalog"
|
69 |
+
}
|
70 |
+
|
71 |
NLLB_LANGUAGE_CODES = {
|
72 |
"eng": "eng_Latn",
|
73 |
"tgl": "tgl_Latn",
|
|
|
300 |
return None, "Failed to load TTS model for the target language"
|
301 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
302 |
try:
|
303 |
+
inputs = tts_tokenizer(text, return_tensors="pt").to(device)
|
304 |
with torch.no_grad():
|
305 |
output = tts_model(**inputs)
|
306 |
speech = output.waveform.cpu().numpy().squeeze()
|
|
|
469 |
|
470 |
if use_whisper:
|
471 |
logger.info("Using Whisper model for transcription")
|
472 |
+
whisper_lang = WHISPER_LANGUAGE_MAPPING.get(source_code, "english") # Default to English if not mapped
|
473 |
inputs = stt_processor_whisper(waveform.numpy(), sampling_rate=16000, return_tensors="pt").to(device)
|
474 |
with torch.no_grad():
|
475 |
+
generated_ids = stt_model_whisper.generate(**inputs, language=whisper_lang)
|
476 |
transcription = stt_processor_whisper.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
477 |
else:
|
478 |
logger.info("Using MMS model for transcription")
|