Jerich committed on
Commit
dec41a9
·
verified ·
1 Parent(s): 150f666

Fix Whisper language handling for Tagalog in translate-audio endpoint

Browse files

- Added WHISPER_LANGUAGE_MAPPING to convert ISO codes (eng, tgl) to Whisper-compatible language names (english, tagalog)
- Updated /translate-audio endpoint to use correct language names for Whisper model
- Removed forced_decoder_ids to resolve conflict with language parameter
- Ensured default fallback to English for unmapped languages
- Addresses error: "Unsupported language: tgl" and forced_decoder_ids warning

Files changed (1) hide show
  1. app.py +10 -3
app.py CHANGED
@@ -17,7 +17,7 @@ from fastapi import FastAPI, HTTPException, UploadFile, File, Form, BackgroundTa
17
  from fastapi.responses import JSONResponse
18
  from fastapi.staticfiles import StaticFiles
19
  from typing import Dict, Any, Optional, Tuple, List
20
- from datetime import datetime, timedelta
21
 
22
  # Configure logging
23
  logging.basicConfig(level=logging.INFO)
@@ -62,6 +62,12 @@ LANGUAGE_MAPPING = {
62
  "Pangasinan": "pag"
63
  }
64
 
 
 
 
 
 
 
65
  NLLB_LANGUAGE_CODES = {
66
  "eng": "eng_Latn",
67
  "tgl": "tgl_Latn",
@@ -294,7 +300,7 @@ def synthesize_speech(text: str, target_code: str) -> Tuple[Optional[str], Optio
294
  return None, "Failed to load TTS model for the target language"
295
  device = "cuda" if torch.cuda.is_available() else "cpu"
296
  try:
297
- inputs = tts_tokenizer(text, return_tensors="pt").to(device)
298
  with torch.no_grad():
299
  output = tts_model(**inputs)
300
  speech = output.waveform.cpu().numpy().squeeze()
@@ -463,9 +469,10 @@ async def translate_audio(audio: UploadFile = File(...), source_lang: str = Form
463
 
464
  if use_whisper:
465
  logger.info("Using Whisper model for transcription")
 
466
  inputs = stt_processor_whisper(waveform.numpy(), sampling_rate=16000, return_tensors="pt").to(device)
467
  with torch.no_grad():
468
- generated_ids = stt_model_whisper.generate(**inputs, language=source_code)
469
  transcription = stt_processor_whisper.batch_decode(generated_ids, skip_special_tokens=True)[0]
470
  else:
471
  logger.info("Using MMS model for transcription")
 
17
  from fastapi.responses import JSONResponse
18
  from fastapi.staticfiles import StaticFiles
19
  from typing import Dict, Any, Optional, Tuple, List
20
+ from datetime import datetime, timedelta
21
 
22
  # Configure logging
23
  logging.basicConfig(level=logging.INFO)
 
62
  "Pangasinan": "pag"
63
  }
64
 
65
+ # Mapping for Whisper language names
66
+ WHISPER_LANGUAGE_MAPPING = {
67
+ "eng": "english",
68
+ "tgl": "tagalog"
69
+ }
70
+
71
  NLLB_LANGUAGE_CODES = {
72
  "eng": "eng_Latn",
73
  "tgl": "tgl_Latn",
 
300
  return None, "Failed to load TTS model for the target language"
301
  device = "cuda" if torch.cuda.is_available() else "cpu"
302
  try:
303
+ inputs = tts_tokenizer(text, return_tensors="pt").to(device)
304
  with torch.no_grad():
305
  output = tts_model(**inputs)
306
  speech = output.waveform.cpu().numpy().squeeze()
 
469
 
470
  if use_whisper:
471
  logger.info("Using Whisper model for transcription")
472
+ whisper_lang = WHISPER_LANGUAGE_MAPPING.get(source_code, "english") # Default to English if not mapped
473
  inputs = stt_processor_whisper(waveform.numpy(), sampling_rate=16000, return_tensors="pt").to(device)
474
  with torch.no_grad():
475
+ generated_ids = stt_model_whisper.generate(**inputs, language=whisper_lang)
476
  transcription = stt_processor_whisper.batch_decode(generated_ids, skip_special_tokens=True)[0]
477
  else:
478
  logger.info("Using MMS model for transcription")