Spaces:

Athspi-aitools
/

Aittsg

Running

App Files Files Community

Athspi committed on Apr 8

Commit

22004d7

verified ·

1 Parent(s): f5e7901

Update app.py

Browse files

Files changed (1) hide show

app.py +116 -214

app.py CHANGED Viewed

@@ -1,253 +1,155 @@
-import os
-import uuid
-import tempfile
-import logging
 from fastapi import FastAPI, Query, HTTPException
-from fastapi.responses import FileResponse, JSONResponse
-# --- gTTS Imports ---
-from gtts import gTTS, gTTSError
-# --- Hugging Face Imports ---
-# !! These imports might take time on first run or if downloading models !!
-try:
-    from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
-    import torch
-    import scipy.io.wavfile
-    # import soundfile as sf # Alternative audio saving
-    print("Hugging Face Transformers loaded.")
-except ImportError:
-    print("Error: Required Hugging Face libraries (transformers, torch, onnxruntime, scipy) not found.")
-    print("Please install them: pip install -r requirements.txt")
-    # Exit if core HF libraries are missing, as the HF endpoint won't work
-    exit("Missing critical Hugging Face dependencies.")
-# --- Configuration & Setup ---
 TEMP_DIR = tempfile.gettempdir()
 os.makedirs(TEMP_DIR, exist_ok=True)
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# --- Hugging Face Model Loading ---
-# Load the model and processor once when the application starts
-# This can take time and memory!
-HF_MODEL_ID = "willwade/mms-tts-multilingual-models-onnx"
-hf_processor = None
-hf_model = None
-hf_supported_langs = set() # Will store supported ISO 639-3 codes
-try:
-    logger.info(f"Loading Hugging Face processor: {HF_MODEL_ID}...")
-    hf_processor = AutoProcessor.from_pretrained(HF_MODEL_ID)
-    logger.info(f"Loading Hugging Face model: {HF_MODEL_ID} (this may take time)...")
-    # Specify provider=['CPUExecutionProvider'] if you don't have CUDA or want to force CPU
-    hf_model = AutoModelForSpeechSeq2Seq.from_pretrained(HF_MODEL_ID, provider=['CPUExecutionProvider']) # Forces CPU via ONNX Runtime provider
-    # Determine supported languages from model config (assuming standard MMS config)
-    if hasattr(hf_model, 'config') and hasattr(hf_model.config, 'id2lang'):
-         hf_supported_langs = set(hf_model.config.id2lang.values())
-         logger.info(f"HF Model Supported Languages (ISO 639-3): {sorted(list(hf_supported_langs))}")
-    else:
-        logger.warning("Could not automatically determine supported languages from HF model config.")
-        # Add known languages manually if needed, or leave empty to skip validation
-        # hf_supported_langs = {'eng', 'spa', 'fra', ...}
-    logger.info("Hugging Face model and processor loaded successfully.")
-except Exception as e:
-    logger.error(f"FATAL: Failed to load Hugging Face model '{HF_MODEL_ID}': {e}", exc_info=True)
-    # Depending on deployment, you might want the app to fail startup
-    # Or allow it to run with only gTTS available
-    logger.warning("Proceeding without Hugging Face TTS functionality.")
-    hf_model = None # Ensure model is None if loading failed
 # --- FastAPI App Initialization ---
 app = FastAPI(
-    title="Multi TTS API",
-    description="API for Text-to-Speech using both gTTS and a Hugging Face MMS model.",
-    version="2.0.0",
 )
 # --- API Endpoints ---
 @app.get("/", tags=["General"])
 def read_root():
     """
-    Root endpoint providing a welcome message and available TTS engines.
     """
-    engines = ["gTTS (/tts/gtts)"]
-    if hf_model is not None:
-        engines.append(f"HuggingFace MMS ({HF_MODEL_ID}) (/tts/hf)")
-    return {
-        "message": "Welcome to the Multi TTS API.",
-        "available_engines": engines
-    }
-# --- gTTS Endpoint ---
-@app.get("/tts/gtts", tags=["TTS - gTTS"])
-def text_to_speech_gtts(
     text: str = Query(
-        ...,
         min_length=1,
-        max_length=500,
         title="Text to Convert",
-        description="The text to convert using Google Text-to-Speech."
     ),
     lang: str = Query(
-        "en",
         min_length=2,
-        max_length=10,
-        title="Language Code (BCP 47)",
-        description="The BCP 47 language code for gTTS (e.g., 'en', 'es', 'fr', 'zh-CN')."
     )
 ):
     """
-    Generates speech using Google Text-to-Speech (gTTS).
-    Returns an MP3 audio file.
     """
-    logger.info(f"gTTS request received: lang='{lang}', text='{text[:50]}...'")
     try:
-        # Generate a unique MP3 filename
-        filename_mp3 = os.path.join(TEMP_DIR, f"gtts_{uuid.uuid4().hex}.mp3")
         tts_object = gTTS(text=text, lang=lang, slow=False)
-        tts_object.save(filename_mp3)
-        logger.info(f"gTTS generated audio file: {filename_mp3}")
         return FileResponse(
-            path=filename_mp3,
             media_type="audio/mpeg",
-            filename=f"gtts_speech_{lang}.mp3"
         )
     except gTTSError as e:
-        logger.error(f"gTTS Error: {e} (lang={lang})", exc_info=False) # Don't need full stack trace for common gTTS errors
-        raise HTTPException(status_code=400, detail=f"gTTS Error: {str(e)}. Ensure language '{lang}' is supported by gTTS.")
-    except Exception as e:
-        logger.error(f"Unexpected error in gTTS endpoint: {e}", exc_info=True)
-        raise HTTPException(status_code=500, detail=f"Internal Server Error: {str(e)}")
-# --- Hugging Face MMS Endpoint ---
-@app.get("/tts/hf", tags=["TTS - HuggingFace MMS"])
-def text_to_speech_hf(
-    text: str = Query(
-        ...,
-        min_length=1,
-        max_length=500, # MMS might have different limits, adjust if known
-        title="Text to Convert",
-        description="The text to convert using the Hugging Face MMS model."
-    ),
-    lang_code: str = Query(
-        "eng", # Default to English ISO code
-        min_length=3,
-        max_length=3,
-        title="Language Code (ISO 639-3)",
-        description=f"The ISO 639-3 language code for the MMS model (e.g., 'eng', 'spa', 'fra'). Supported: {sorted(list(hf_supported_langs)) if hf_supported_langs else 'Unknown - check logs/model card'}"
-    )
-):
-    """
-    Generates speech using the Hugging Face MMS model (`willwade/mms-tts-multilingual-models-onnx`).
-    Returns a WAV audio file.
-    """
-    logger.info(f"HF MMS request received: lang_code='{lang_code}', text='{text[:50]}...'")
-    if hf_model is None or hf_processor is None:
-        logger.warning("HF endpoint called, but model/processor not loaded.")
-        raise HTTPException(status_code=503, detail="Hugging Face TTS service is currently unavailable.")
-    # --- Language Validation ---
-    if hf_supported_langs and lang_code not in hf_supported_langs:
-         logger.warning(f"Unsupported language code '{lang_code}' requested for HF model.")
-         raise HTTPException(
-             status_code=400,
-             detail=f"Unsupported language code: '{lang_code}'. Supported codes (ISO 639-3): {sorted(list(hf_supported_langs))}"
-         )
-    try:
-        # --- Preprocessing ---
-        logger.debug("Preprocessing text with HF processor...")
-        # MMS models often don't need language specified in the processor if handled by speaker_id in generate
-        inputs = hf_processor(text, return_tensors="pt")
-        # --- Speaker ID / Language Selection for Generation ---
-        # The willwade model uses language codes directly mapped in its config
-        target_lang_id = None
-        if hasattr(hf_model, 'config') and hasattr(hf_model.config, 'lang_code_to_id'):
-            target_lang_id = hf_model.config.lang_code_to_id.get(lang_code)
-        if target_lang_id is None:
-             logger.error(f"Could not find target language ID for code '{lang_code}' in model config.")
-             # This check might be redundant if the initial validation passed, but good safeguard
-             raise HTTPException(status_code=500, detail=f"Internal configuration error: Cannot map language code '{lang_code}' to model ID.")
-        logger.debug(f"Generating speech with HF model for lang_code '{lang_code}' (ID: {target_lang_id})...")
-        # --- Generation (using torch.no_grad for inference efficiency) ---
-        with torch.no_grad():
-             # Use speaker_id to specify the target language for MMS models
-             # output_dict = hf_model.generate(**inputs, speaker_id=target_lang_id, return_dict=True) # Use this if output_attentions etc. needed
-             outputs = hf_model.generate(**inputs, speaker_id=target_lang_id)
-        # Extract waveform - adjust key/index based on actual model output structure if needed
-        # Typically the primary output tensor is the waveform
-        waveform = outputs[0].cpu().numpy().squeeze() # Get waveform, move to CPU, convert to numpy, remove batch dim if present
-        logger.debug(f"Generated waveform shape: {waveform.shape}")
-        if waveform.ndim != 1 or waveform.size == 0:
-             logger.error(f"Unexpected waveform shape or size: {waveform.shape}")
-             raise ValueError("Generated audio waveform is invalid.")
-        # Get sampling rate from model config
-        sampling_rate = hf_model.config.sampling_rate
-        logger.debug(f"Using sampling rate: {sampling_rate}")
-        # --- Save as WAV ---
-        filename_wav = os.path.join(TEMP_DIR, f"hf_mms_{uuid.uuid4().hex}.wav")
-        logger.info(f"Saving generated audio to WAV file: {filename_wav}")
-        # Using scipy to write WAV
-        # Ensure waveform is in the correct format (float32 or int16 typically)
-        # MMS models usually output float32 between -1.0 and 1.0
-        if waveform.dtype != 'float32':
-             logger.warning(f"Waveform dtype is {waveform.dtype}, converting to float32 for saving.")
-             waveform = waveform.astype('float32')
-        # Scale if necessary, but MMS usually outputs in [-1, 1] range suitable for float32 wav
-        scipy.io.wavfile.write(filename_wav, sampling_rate, waveform)
-        # # Alternative using soundfile (often more robust)
-        # sf.write(filename_wav, waveform, sampling_rate, subtype='FLOAT') # Use 'PCM_16' if int16 desired
-        return FileResponse(
-            path=filename_wav,
-            media_type="audio/wav",
-            filename=f"hf_mms_speech_{lang_code}.wav"
-        )
-    except ValueError as e: # Catch specific errors like invalid waveform
-         logger.error(f"Value error during HF TTS processing: {e}", exc_info=True)
-         raise HTTPException(status_code=400, detail=f"Input or Processing Error: {str(e)}")
     except Exception as e:
-        logger.error(f"Unexpected error in HF MMS endpoint: {e}", exc_info=True)
-        raise HTTPException(status_code=500, detail=f"Internal Server Error: {str(e)}")
-# --- How to Run ---
-# 1. Save this code as `app.py`.
-# 2. Create `requirements.txt` (as shown above).
-# 3. Install dependencies: `pip install -r requirements.txt`
-# 4. Run the FastAPI server: `uvicorn app:app --reload`
-#    (Use `--host 0.0.0.0` if running in Docker or need external access)
 #
-# --- Example Usage ---
-# - gTTS (English): http://127.0.0.1:8000/tts/gtts?text=Hello+from+gTTS&lang=en
-# - gTTS (Spanish): http://127.0.0.1:8000/tts/gtts?text=Hola+desde+gTTS&lang=es
 #
-# - HF MMS (English): http://127.0.0.1:8000/tts/hf?text=Hello+from+the+MMS+model&lang_code=eng
-# - HF MMS (Spanish): http://127.0.0.1:8000/tts/hf?text=Hola+desde+el+modelo+MMS&lang_code=spa
-# - HF MMS (French): http://127.0.0.1:8000/tts/hf?text=Bonjour+du+modèle+MMS&lang_code=fra
-#   (Check supported 'lang_code' values from server logs or model card)

+# app.py
 from fastapi import FastAPI, Query, HTTPException
+from fastapi.responses import FileResponse
+from gtts import gTTS, gTTSError  # Import gTTSError for specific error handling
+import uuid
+import os  # Import os module for path operations
+import tempfile # Import tempfile for better temporary file handling
+# --- Configuration ---
+# Use tempfile to get a cross-platform temporary directory
 TEMP_DIR = tempfile.gettempdir()
+# Ensure the temporary directory exists
 os.makedirs(TEMP_DIR, exist_ok=True)
 # --- FastAPI App Initialization ---
 app = FastAPI(
+    title="gTTS API",
+    description="A simple API to convert text to speech using Google Text-to-Speech (gTTS). Supports multiple languages including Tamil, Sinhala, and many others.",
+    version="1.2.0", # Increment version for documentation changes
 )
 # --- API Endpoints ---
 @app.get("/", tags=["General"])
 def read_root():
     """
+    Root endpoint providing a welcome message.
     """
+    return {"message": "Welcome to the gTTS API. Use the /tts endpoint to generate speech."}
+@app.get("/tts", tags=["Text-to-Speech"])
+def text_to_speech(
     text: str = Query(
+        ..., # Ellipsis makes the parameter required
         min_length=1,
+        max_length=500, # Adjust max length as needed
         title="Text to Convert",
+        description="The text you want to convert into speech (1-500 characters)."
     ),
     lang: str = Query(
+        "en", # Default language is English
         min_length=2,
+        max_length=10, # Allow for language codes like 'en-us', 'zh-CN' etc.
+        title="Language Code",
+        description="The BCP 47 language code for the speech synthesis (e.g., 'en', 'es', 'ta', 'si', 'ja', 'zh-CN'). See gTTS documentation for supported languages."
     )
 ):
     """
+    Converts the provided text into an MP3 audio file using the specified language.
+    - **text**: The text to synthesize (required).
+    - **lang**: The language code (e.g., 'en', 'es', 'fr', 'ta', 'si'). Defaults to 'en'. **Crucially, gTTS must support this language code.**
     """
     try:
+        # Generate a unique filename in the configured temporary directory
+        filename = os.path.join(TEMP_DIR, f"{uuid.uuid4().hex}.mp3")
+        # Create gTTS object with text and language
+        # Use slow=False for normal speed speech
         tts_object = gTTS(text=text, lang=lang, slow=False)
+        # Save the audio file
+        tts_object.save(filename)
+        # Return the audio file as a response
+        # The 'filename' parameter sets the download name for the browser
         return FileResponse(
+            path=filename,
             media_type="audio/mpeg",
+            filename=f"speech_{lang}.mp3" # Suggest a filename like speech_en.mp3 or speech_ta.mp3
+            # TODO: the saved MP3 is never deleted from TEMP_DIR; consider a background task that removes it after the response is sent.
         )
     except gTTSError as e:
+        # Handle specific gTTS errors (like invalid language code, network issues)
+        detail_message = f"gTTS Error: {str(e)}. Ensure the language code '{lang}' is supported and text is appropriate for the language."
+        # Check common error patterns
+        if "400 (Bad Request)" in str(e) or "Language not supported" in str(e):
+             raise HTTPException(status_code=400, detail=detail_message)
+        elif "500 (Internal Server Error)" in str(e) or "Failed to connect" in str(e):
+             # Treat these as potential temporary Google service issues
+             raise HTTPException(status_code=503, detail=f"Service Error: {str(e)}. Could be a temporary issue with the TTS service.")
+        else: # Other gTTS errors
+             raise HTTPException(status_code=503, detail=detail_message) # 503 Service Unavailable likely
+    except ValueError as e:
+        # Potentially handle other value errors if gTTS raises them for certain inputs
+         raise HTTPException(status_code=400, detail=f"Input Error: {str(e)}")
     except Exception as e:
+        # Catch any other unexpected errors
+        # Log the error for debugging
+        # import logging
+        # logging.exception(f"An unexpected error occurred during TTS generation for lang='{lang}'")
+        raise HTTPException(status_code=500, detail=f"Internal Server Error: An unexpected error occurred.")
+# --- How to Run (Instructions) ---
+# 1. Save this code as `app.py`.
+# 2. Install necessary libraries:
+#    pip install fastapi "uvicorn[standard]" gTTS
+# 3. Run the FastAPI server using Uvicorn:
+#    uvicorn app:app --reload
+#
+# --- How to Use - Examples ---
+# Open your browser or use a tool like curl/Postman.
+# Access the TTS endpoint with the 'text' and 'lang' query parameters.
+# NOTE: Text containing non-ASCII characters needs to be URL-encoded. Most browsers do this automatically.
+#
+# - English (en - Default):
+#   Text: "Hello, world!"
+#   URL: http://127.0.0.1:8000/tts?text=Hello%2C%20world%21
+#
+# - Spanish (es):
+#   Text: "Hola Mundo"
+#   URL: http://127.0.0.1:8000/tts?text=Hola%20Mundo&lang=es
+#
+# - French (fr):
+#   Text: "Bonjour le monde"
+#   URL: http://127.0.0.1:8000/tts?text=Bonjour%20le%20monde&lang=fr
+#
+# - German (de):
+#   Text: "Hallo Welt"
+#   URL: http://127.0.0.1:8000/tts?text=Hallo%20Welt&lang=de
+#
+# - Tamil (ta):
+#   Text: "வணக்கம் உலகம்"
+#   URL: http://127.0.0.1:8000/tts?text=%E0%AE%B5%E0%AE%A3%E0%AE%95%E0%AF%8D%E0%AE%95%E0%AE%AE%E0%AF%8D%20%E0%AE%89%E0%AE%B2%E0%AE%95%E0%AE%AE%E0%AF%8D&lang=ta
+#
+# - Sinhala (si):
+#   Text: "හෙලෝ ලෝකය"
+#   URL: http://127.0.0.1:8000/tts?text=%E0%B7%84%E0%B7%99%E0%B6%BD%E0%B7%9D%20%E0%B6%BD%E0%B7%9D%E0%B6%9A%E0%B6%BA&lang=si
+#
+# - Japanese (ja):
+#   Text: "こんにちは世界"
+#   URL: http://127.0.0.1:8000/tts?text=%E3%81%93%E3%82%93%E3%81%AB%E3%81%A1%E3%81%AF%E4%B8%96%E7%95%8C&lang=ja
+#
+# - Chinese (Mandarin, Simplified) (zh-CN):
+#   Text: "你好世界"
+#   URL: http://127.0.0.1:8000/tts?text=%E4%BD%A0%E5%A5%BD%E4%B8%96%E7%95%8C&lang=zh-CN
+#
+# - Russian (ru):
+#   Text: "Привет мир"
+#   URL: http://127.0.0.1:8000/tts?text=%D0%9F%D1%80%D0%B8%D0%B2%D0%B5%D1%82%20%D0%BC%D0%B8%D1%80&lang=ru
+#
+# - Hindi (hi):
+#   Text: "नमस्ते दुनिया"
+#   URL: http://127.0.0.1:8000/tts?text=%E0%A4%A8%E0%A4%AE%E0%A4%B8%E0%A5%8D%E0%A4%A4%E0%A5%87%20%E0%A4%A6%E0%A5%81%E0%A4%A8%E0%A4%BF%E0%A4%AF%E0%A4%BE&lang=hi
 #
+# - Arabic (ar):
+#   Text: "مرحبا بالعالم"
+#   URL: http://127.0.0.1:8000/tts?text=%D9%85%D8%B1%D8%AD%D8%A8%D8%A7%20%D8%A8%D8%A7%D9%84%D8%B9%D8%A7%D9%84%D9%85&lang=ar
 #
+# Find more supported language codes in the gTTS documentation or common lists of BCP 47 codes.
+# The API will return an MP3 file download or playback depending on your browser/client.
+# If you provide an unsupported language code, you should get a 400 Bad Request error.