Spaces:

Athspi-aitools
/

Aittsg

Running

App Files Files Community

Athspi committed on Apr 8

Commit

6dde081

verified ·

1 Parent(s): 26cfa8b

Create app.py

Browse files

Files changed (1) hide show

app.py +232 -0

app.py ADDED Viewed

	@@ -0,0 +1,232 @@

+import io
+import logging
+import os
+import time
+import uuid
+from pathlib import Path
+
+import numpy as np
+import soundfile as sf
+import torch
+from fastapi import FastAPI, HTTPException, Body, BackgroundTasks
+from fastapi.responses import FileResponse, StreamingResponse # To send binary audio data
+from pydantic import BaseModel
+from transformers import pipeline
+# --- Configuration ---
+# Choose a TTS model from the Hugging Face Hub
+# This identifier is passed to `pipeline("text-to-speech", ...)` at startup and
+# is echoed back in the API description, "/" and "/health" responses.
+MODEL_NAME = "espnet/kan-bayashi_ljspeech_vits" # Example model
+# MODEL_NAME = "suno/bark-small"
+# Directories
+# BASE_DIR is the directory containing this module; generated WAV files are
+# written below it.
+BASE_DIR = Path(__file__).parent
+TEMP_AUDIO_DIR = BASE_DIR / "temp_audio" # For temporary storage before sending
+# Ensure temporary audio directory exists
+# (runs at import time; a no-op when the directory is already present)
+TEMP_AUDIO_DIR.mkdir(parents=True, exist_ok=True)
+# Configure Logging
+# INFO level on the root logger; `logger` is the module-level logger used
+# throughout this file.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# --- Pydantic Model for Request Body ---
+# JSON body accepted by POST /api/tts: an object with a single "text" field.
+class TTSRequest(BaseModel):
+    # Text to synthesize. The endpoint rejects empty or whitespace-only
+    # values with HTTP 400.
+    text: str
+# --- Load TTS Model (Load on startup) ---
+logger.info("Attempting to load TTS model...")
+start_load_time = time.time()
+tts_pipeline = None
+try:
+    # Use GPU if available
+    if torch.cuda.is_available():
+        device = "cuda"
+    # Check for MPS (Apple Silicon) support if not CUDA
+    elif torch.backends.mps.is_available():
+        device = "mps"
+    else:
+        device = "cpu"
+    logger.info(f"Using device: {device}")
+    tts_pipeline = pipeline("text-to-speech", model=MODEL_NAME, device=device)
+    logger.info(f"Model '{MODEL_NAME}' loaded successfully in {time.time() - start_load_time:.2f} seconds.")
+except Exception as e:
+    logger.error(f"FATAL: Could not load TTS model '{MODEL_NAME}'. Error: {e}", exc_info=True)
+    # The application can still run, but the /api/tts endpoint will fail until the model is loaded/fixed.
+# --- Initialize FastAPI App ---
+# Application object that the route decorators below attach to. The
+# description string is built once, at import time, from MODEL_NAME.
+app = FastAPI(
+    title="Text-to-Speech API Service",
+    description=f"Provides a text-to-speech endpoint using the {MODEL_NAME} model. Send text, receive WAV audio.",
+    version="1.0.0"
+)
+# --- Background Task for Cleanup ---
+def cleanup_temp_file(filepath: Path):
+    """Removes a file in the background."""
+    try:
+        if filepath.exists():
+            os.remove(filepath)
+            logger.info(f"Cleaned up temp file: {filepath.name}")
+    except OSError as e:
+        logger.error(f"Error deleting temp file {filepath.name}: {e}")
+# --- API Endpoint for Text-to-Speech ---
+@app.post(
+    "/api/tts",
+    tags=["TTS"],
+    summary="Generate Speech from Text",
+    description="""Send a JSON object with a "text" field.
+    Returns the generated speech as a WAV audio file stream.""",
+    responses={
+        200: {
+            "content": {"audio/wav": {}},
+            "description": "Successful response returning the WAV audio stream.",
+        },
+        400: {"description": "Bad Request (e.g., empty text)"},
+        500: {"description": "Internal Server Error (e.g., model error)"},
+        503: {"description": "Service Unavailable (e.g., model not loaded)"},
+    },
+)
+async def generate_speech_api(
+    background_tasks: BackgroundTasks,
+    tts_request: TTSRequest = Body(...)
+):
+    """
+    Receives text via POST request and returns the generated WAV audio directly.
+    """
+    if tts_pipeline is None:
+        raise HTTPException(status_code=503, detail="TTS Model is not available or failed to load.")
+    text = tts_request.text
+    if not text or not text.strip():
+        raise HTTPException(status_code=400, detail="Input text cannot be empty.")
+    logger.info(f"Received API request to synthesize: '{text[:50]}...'") # Log truncated text
+    start_synth_time = time.time()
+    try:
+        # --- Generate Audio ---
+        with torch.no_grad(): # Good practice for inference
+             output = tts_pipeline(text)
+        audio_data = output.get("audio")
+        sampling_rate = output.get("sampling_rate")
+        if audio_data is None or sampling_rate is None:
+             logger.error("TTS pipeline output missing 'audio' or 'sampling_rate'.")
+             raise ValueError("Invalid output from TTS pipeline.")
+        # Ensure NumPy array
+        if isinstance(audio_data, torch.Tensor):
+            # Ensure it's on CPU before converting to numpy
+            audio_data = audio_data.cpu().numpy()
+        if not isinstance(audio_data, np.ndarray):
+             logger.error(f"Unexpected audio data type: {type(audio_data)}")
+             raise TypeError(f"Expected audio data as NumPy array, got {type(audio_data)}")
+        # Normalize if float and outside [-1, 1] range (important for WAV)
+        if np.issubdtype(audio_data.dtype, np.floating):
+             max_val = np.max(np.abs(audio_data))
+             if max_val > 1.0:
+                 audio_data = audio_data / max_val
+             # Convert to 16-bit integer format for standard WAV
+             audio_data = (audio_data * 32767).astype(np.int16)
+        elif not np.issubdtype(audio_data.dtype, np.integer):
+             logger.warning(f"Audio data is not float or int: {audio_data.dtype}. Attempting conversion to int16.")
+             # Attempt conversion if possible, might need adjustment based on model output
+             audio_data = audio_data.astype(np.int16)
+        synthesis_time = time.time() - start_synth_time
+        logger.info(f"Audio generated in {synthesis_time:.2f} seconds.")
+        # --- Prepare Audio for Streaming ---
+        # Method 1: Save to temp file and stream it (often safer for large files)
+        filename = f"speech_{uuid.uuid4()}.wav"
+        filepath = TEMP_AUDIO_DIR / filename
+        sf.write(filepath, audio_data, sampling_rate, subtype='PCM_16') # Save as standard 16-bit WAV
+        logger.info(f"Temporary audio saved to: {filepath.name}")
+        # Schedule the cleanup task to run after the response is sent
+        background_tasks.add_task(cleanup_temp_file, filepath)
+        # Return the file directly as a streaming response
+        return FileResponse(
+            path=filepath,
+            media_type="audio/wav",
+            filename=filename # Suggests a filename to the client
+        )
+        # # Method 2: Stream directly from memory buffer (avoids disk I/O)
+        # buffer = io.BytesIO()
+        # sf.write(buffer, audio_data, sampling_rate, format='WAV', subtype='PCM_16')
+        # buffer.seek(0) # Reset buffer position to the beginning
+        # logger.info("Audio prepared in memory buffer.")
+        # return StreamingResponse(buffer, media_type="audio/wav")
+    except Exception as e:
+        logger.error(f"Error during speech generation or streaming: {e}", exc_info=True)
+        # Cleanup temp file if it was created before an error occurred during streaming prep
+        if 'filepath' in locals() and filepath.exists():
+             logger.info(f"Cleaning up temp file due to error: {filepath.name}")
+             os.remove(filepath)
+        raise HTTPException(status_code=500, detail=f"Failed to process speech request. Error: {str(e)}")
+# --- Health Check Endpoint (Good Practice) ---
+@app.get("/health", tags=["System"], summary="Check API Health")
+async def health_check():
+    """
+    Simple health check endpoint. Checks if the TTS model is loaded.
+    """
+    if tts_pipeline is None:
+        return {"status": "unhealthy", "reason": "TTS model is not loaded or failed to load."}
+    # Can add more checks here (e.g., disk space, dependencies)
+    return {"status": "ok", "model_loaded": MODEL_NAME}
+# --- Root Endpoint (Optional Information) ---
+@app.get("/", tags=["System"], summary="API Information")
+async def read_root():
+    """
+    Provides basic information about the API.
+    """
+    return {
+        "message": "Welcome to the Text-to-Speech API Service!",
+        "model_used": MODEL_NAME,
+        "tts_endpoint": "/api/tts",
+        "health_endpoint": "/health",
+        "documentation": "/docs" # Link to FastAPI auto-generated docs
+    }
+# --- Optional: Add cleanup for *old* files on startup (if using FileResponse) ---
+def cleanup_old_audio_files(max_age_seconds: int = 3600): # Clean files older than 1 hour
+    now = time.time()
+    count = 0
+    try:
+        for filename in os.listdir(TEMP_AUDIO_DIR):
+            filepath = TEMP_AUDIO_DIR / filename
+            if filepath.is_file() and filename.startswith("speech_") and filename.endswith(".wav"):
+                try:
+                    file_mod_time = os.path.getmtime(filepath)
+                    if (now - file_mod_time) > max_age_seconds:
+                        os.remove(filepath)
+                        logger.info(f"Startup cleanup: Removed old temp file {filename}")
+                        count += 1
+                except OSError as e:
+                    logger.warning(f"Startup cleanup: Error removing file {filename}: {e}")
+        if count > 0:
+            logger.info(f"Startup cleanup: Removed {count} old audio files.")
+    except Exception as e:
+        logger.error(f"Startup cleanup: Error during old file cleanup: {e}")
+# Run cleanup on startup
+# Executed at import time with the default max age (3600 seconds), so stale
+# temp files are purged whenever this module is loaded.
+cleanup_old_audio_files()
+# --- How to Run Locally (for testing) ---
+# Left commented out; uncomment to start the server by running this file directly.
+# if __name__ == "__main__":
+#     import uvicorn
+#     # Ensure temp_audio exists before starting
+#     TEMP_AUDIO_DIR.mkdir(parents=True, exist_ok=True)
+#     cleanup_old_audio_files() # Run cleanup before starting server
+#     uvicorn.run("app:app", host="127.0.0.1", port=8000, reload=True) # Use reload=False for production testing