# NOTE: page header captured with the file, not part of the module -- "Spaces: Build error"
# tts.py
import os
import zlib
from typing import AsyncGenerator, Generator, Optional, Protocol, Tuple, Union

import numpy as np
import spaces
import torch
import torchaudio
from numpy.typing import NDArray
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice
# Create cache/output directory (no error if it already exists)
os.makedirs("outputs", exist_ok=True)

# Global TTS model instance; stays None until TortoiseTTSModel first loads it
tts_model = None
# Define TTSOptions for compatibility with FastRTC
class TortoiseOptions:
    """Per-request voice settings passed to the Tortoise TTS wrapper.

    Attributes are plain and public so callers can adjust them after
    construction.
    """

    def __init__(self, voice_preset: str = "random", voice_file_path: Optional[str] = None):
        """Store the voice selection.

        Args:
            voice_preset: Name of the voice to use when no reference file is
                given. Defaults to ``"random"``.
            voice_file_path: Optional path to a reference audio clip. The
                consumer decides how it takes priority over ``voice_preset``.
        """
        self.voice_preset = voice_preset
        self.voice_file_path = voice_file_path
# The main Tortoise TTS wrapper class implementing FastRTC's TTSModel protocol
class TortoiseTTSModel:
    """Expose a shared Tortoise ``TextToSpeech`` engine via tts/stream methods.

    The engine lives in the module-level ``tts_model`` global and is created
    the first time this class is instantiated; later instances reuse it.
    """

    # Sample rate returned alongside every generated waveform.
    OUTPUT_SAMPLE_RATE = 24000
    # Sample rate requested when loading a user-supplied reference clip.
    VOICE_SAMPLE_RATE = 22050
    # Quality preset handed to ``tts_with_preset``. This is a speed/quality
    # setting and is unrelated to which voice is used.
    QUALITY_PRESET = "fast"
    # Number of samples per streamed chunk.
    CHUNK_SIZE = 4000

    def __init__(self):
        """Bind this wrapper to the shared engine, loading it if needed."""
        global tts_model
        if tts_model is None:
            self._initialize_model()
        self.tts_model = tts_model

    def _initialize_model(self):
        """Create the shared engine and store it in the ``tts_model`` global."""
        global tts_model
        print("Initializing Tortoise-TTS model...")
        tts_model = TextToSpeech(use_deepspeed=torch.cuda.is_available())
        print(f"Model initialized. Using device: {next(tts_model.autoregressive.parameters()).device}")

    def _generate_speech(self, text, options=None):
        """Synthesize ``text`` and return ``(sample_rate, float32 ndarray)``.

        Voice selection, in priority order:
          1. ``options.voice_file_path`` if it names an existing file;
          2. the named voice ``options.voice_preset`` (resolved via
             ``load_voice``), unless it is ``"random"`` or empty;
          3. otherwise no conditioning is passed and the engine chooses.

        Any exception is logged with ``print`` and re-raised unchanged.
        """
        options = options or TortoiseOptions()
        voice_samples = None
        conditioning_latents = None
        try:
            voice_path = options.voice_file_path
            if voice_path and os.path.exists(voice_path):
                print(f"Loading voice from {voice_path}")
                # load_audio returns one tensor (not an (audio, rate) pair),
                # and the engine expects a list of reference clips.
                voice_samples = [load_audio(voice_path, self.VOICE_SAMPLE_RATE)]
            elif options.voice_preset and options.voice_preset != "random":
                voice_samples, conditioning_latents = load_voice(options.voice_preset)

            print(f"Generating speech for text: {text[:50]}...")
            gen = self.tts_model.tts_with_preset(
                text,
                voice_samples=voice_samples,
                conditioning_latents=conditioning_latents,
                preset=self.QUALITY_PRESET,
            )
            # Drop the leading batch axis and hand back CPU float32 samples.
            return self.OUTPUT_SAMPLE_RATE, gen.squeeze(0).cpu().numpy().astype(np.float32)
        except Exception as e:
            print(f"Error generating speech: {str(e)}")
            raise

    @staticmethod
    def _iter_chunks(sample_rate, audio_array, chunk_size):
        """Yield ``(sample_rate, chunk)`` pairs cut along the last axis.

        Slicing the last axis keeps any leading channel axis intact and works
        for both ``(N,)`` and ``(1, N)`` arrays. Slicing axis 0 of a ``(1, N)``
        array would return the whole signal as a single chunk.
        """
        total_samples = audio_array.shape[-1]
        for start in range(0, total_samples, chunk_size):
            yield sample_rate, audio_array[..., start:start + chunk_size]

    def tts(self, text: str, options: Optional["TortoiseOptions"] = None) -> Tuple[int, NDArray[np.float32]]:
        """Generate speech audio from text in a single call."""
        return self._generate_speech(text, options)

    async def stream_tts(self, text: str, options: Optional["TortoiseOptions"] = None) -> AsyncGenerator[Tuple[int, NDArray[np.float32]], None]:
        """Stream speech audio asynchronously in chunks.

        The full utterance is synthesized (synchronously) before the first
        chunk is yielded; only delivery is chunked.
        """
        sample_rate, audio_array = self._generate_speech(text, options)
        for item in self._iter_chunks(sample_rate, audio_array, self.CHUNK_SIZE):
            yield item

    def stream_tts_sync(self, text: str, options: Optional["TortoiseOptions"] = None) -> Generator[Tuple[int, NDArray[np.float32]], None, None]:
        """Stream speech audio synchronously in chunks.

        The full utterance is synthesized before the first chunk is yielded.
        """
        sample_rate, audio_array = self._generate_speech(text, options)
        yield from self._iter_chunks(sample_rate, audio_array, self.CHUNK_SIZE)
# Create a singleton instance for easy import.
# Instantiating at import time triggers the model load if it has not happened yet.
tortoise_tts = TortoiseTTSModel()
# Legacy function for backward compatibility
async def generate_speech(text, voice_preset="random", voice_file_path=None):
    """Synthesize ``text`` and return ``(output_path, (sample_rate, tensor))``.

    This function does NOT write the audio to disk: ``output_path`` is only
    the suggested location under ``outputs/``; saving is left to the caller.

    Args:
        text: Text to synthesize.
        voice_preset: Voice name forwarded through ``TortoiseOptions``.
        voice_file_path: Optional reference clip forwarded through
            ``TortoiseOptions``.

    Returns:
        A tuple of the suggested ``.wav`` path and a ``(sample_rate, audio)``
        pair where ``audio`` is a torch tensor built from the numpy result.
    """
    options = TortoiseOptions(voice_preset, voice_file_path)
    sample_rate, audio_array = tortoise_tts.tts(text, options)
    # Built-in hash() of a str is salted per process, so it would give the
    # same text a different file name on every run. CRC32 is stable.
    text_key = zlib.crc32(text.encode("utf-8")) % 10000
    output_path = f"outputs/tts_output_{text_key}.wav"
    return output_path, (sample_rate, torch.from_numpy(audio_array))