# Spaces: RSHVR / Command_RTC
# Build error
# File size: 6,834 Bytes

import os
import tempfile
import gradio as gr
import torch
import torchaudio
import spaces
from huggingface_hub import snapshot_download
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio
import numpy as np
import uuid
from pydub import AudioSegment

# Generated clips are written under ./outputs, so make sure the folder exists.
os.makedirs("outputs", exist_ok=True)

# Report which device torch can see at import time.
# NOTE(review): on a ZeroGPU Space the value seen here may differ from what
# is visible inside @spaces.GPU functions -- confirm against the Space logs.
_cuda_visible = torch.cuda.is_available()
if _cuda_visible:
    device = "cuda"
else:
    device = "cpu"
print(f"Initial device check: {device}")

# Probe tensor: the GPU-decorated functions below print its ``.device`` so the
# logs show where tensors actually live at call time.
zero = torch.Tensor([0])
if torch.cuda.is_available():
    zero = zero.cuda()
    print(f"Zero tensor device: {zero.device}")

# Tortoise model handle; left as None here and created on first use by
# generate_tts_with_voice.
tts = None

# Voice names offered in the "Preset Voice" dropdown. "random" comes first,
# followed by the named voices; the "train_*" entries share a prefix and are
# generated from their suffixes to keep the list compact.
PRESET_VOICES = [
    "random",
    "angie",
    "daniel",
    "deniro",
    "emma",
    "freeman",
    "geralt",
    "halle",
    "jlaw",
    "lj",
    "mol",
    "myself",
    "pat",
    "snakes",
    "tim_reynolds",
    "tom",
    *(
        f"train_{suffix}"
        for suffix in (
            "atkins",
            "daws",
            "dotrice",
            "dreams",
            "empire",
            "grace",
            "kennard",
            "lescault",
            "mouse",
        )
    ),
    "weaver",
    "william",
]

def process_audio_file(audio_file_path):
    """Return the path of a 22.05 kHz WAV version of ``audio_file_path``.

    The input is converted to WAV with pydub when its extension is not
    ``.wav``, then resampled with torchaudio when its sample rate is not
    22050 Hz.

    Args:
        audio_file_path: Path to the uploaded or recorded audio file.

    Returns:
        ``audio_file_path`` itself when it already is a 22.05 kHz ``.wav``
        file; otherwise the path of a newly created temporary ``.wav`` file.
        Temporary files are not deleted here -- the caller owns them.

    Raises:
        Whatever pydub / torchaudio raise when the file cannot be decoded.
    """
    target_sample_rate = 22050
    converted_path = None

    # Only decode with pydub when a format conversion is actually needed;
    # WAV input goes straight to torchaudio.
    if not audio_file_path.lower().endswith('.wav'):
        audio = AudioSegment.from_file(audio_file_path)
        # mkstemp + close: reserve a unique name without keeping an open
        # handle around (an open NamedTemporaryFile leaks a descriptor and
        # blocks the export on Windows).
        fd, converted_path = tempfile.mkstemp(suffix='.wav')
        os.close(fd)
        audio.export(converted_path, format="wav")
        audio_file_path = converted_path

    y, sr = torchaudio.load(audio_file_path)
    if sr != target_sample_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sr, new_freq=target_sample_rate
        )
        y = resampler(y)

        fd, resampled_path = tempfile.mkstemp(suffix='.wav')
        os.close(fd)
        torchaudio.save(resampled_path, y, target_sample_rate)

        # The converted intermediate is superseded by the resampled file;
        # remove it so it does not pile up in the temp directory.
        if converted_path is not None:
            try:
                os.remove(converted_path)
            except OSError:
                pass  # best-effort cleanup; the result is still valid

        audio_file_path = resampled_path

    return audio_file_path

@spaces.GPU
def generate_tts_with_voice(text, voice_sample_path=None, preset_voice=None):
    """Generate speech for ``text`` with Tortoise and save it under ``outputs/``.

    Voice selection, in priority order:
      1. ``voice_sample_path`` -- clone the voice from this audio file.
      2. ``preset_voice`` (anything other than "random") -- one of the voices
         bundled with Tortoise, loaded by name.
      3. Otherwise -- Tortoise picks random conditioning latents.

    Args:
        text: Text to synthesise.
        voice_sample_path: Optional path to a reference clip for cloning.
        preset_voice: Optional bundled voice name, or "random"/None/"".

    Returns:
        ``(output_path, message)``. ``output_path`` is the generated WAV path
        on success and ``None`` on failure; ``message`` is a status string
        starting with "Success:" or "Error:". Exceptions are reported through
        the message instead of being raised, so the UI can display them.
    """
    global tts

    # Quality/speed preset understood by TextToSpeech.tts_with_preset
    # ('ultra_fast', 'fast', 'standard', 'high_quality'). This is NOT a voice
    # name; the voice is supplied via voice_samples / conditioning_latents.
    quality_preset = "fast"

    try:
        # Now that we're inside the @spaces.GPU decorated function, CUDA should be available
        print(f"GPU function device: {zero.device}")

        if not text or not text.strip():
            return None, "Error: Please enter some text to speak."

        # Initialize TTS model if not already initialized
        if tts is None:
            tts = TextToSpeech(use_deepspeed=torch.cuda.is_available())
            print("TTS model initialized")

        voice_samples = None
        conditioning_latents = None

        if voice_sample_path:
            # load_audio returns a single tensor (not a tuple); Tortoise
            # expects a list of such reference clips.
            voice_sample_path = process_audio_file(voice_sample_path)
            voice_samples = [load_audio(voice_sample_path, 22050)]
        elif preset_voice and preset_voice != "random":
            # Local import keeps this block self-contained; load_voice lives
            # in the same module as load_audio.
            from tortoise.utils.audio import load_voice

            voice_samples, conditioning_latents = load_voice(preset_voice)
        # else: leave both as None so Tortoise uses a random voice.

        # Generate the speech
        output_id = str(uuid.uuid4())[:8]
        output_path = f"outputs/tts_output_{output_id}.wav"

        gen = tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset=quality_preset,
        )

        # Tortoise produces 24 kHz audio.
        torchaudio.save(output_path, gen.squeeze(0).cpu(), 24000)

        return output_path, "Success: TTS generation completed."
    except Exception as e:
        # Deliberate catch-all at the UI boundary: surface the failure as a
        # status message rather than crashing the request.
        return None, f"Error: {str(e)}"

@spaces.GPU
def tts_interface(text, audio_file, preset_voice, record_audio):
    """Gradio click handler: pick the voice source and run TTS.

    A microphone recording takes precedence over an uploaded file, which in
    turn takes precedence over the preset dropdown.

    Args:
        text: Text to synthesise.
        audio_file: Path of an uploaded voice sample, or None.
        preset_voice: Selected preset voice name; "" or None means "random"
            when no custom voice is supplied.
        record_audio: Microphone recording from ``gr.Audio``. With the
            component's numpy type this is ``(sample_rate, data)`` where
            ``data`` is a numpy array shaped ``(samples,)`` or
            ``(samples, channels)``. A file path string is also accepted.

    Returns:
        ``(output_path, message)`` as produced by generate_tts_with_voice;
        ``output_path`` is None on failure.
    """
    # NOTE(review): this handler and generate_tts_with_voice are both wrapped
    # in @spaces.GPU -- confirm nested GPU-decorated calls behave as intended.
    print(f"Processing with device: {zero.device}")

    voice_sample_path = None

    # Determine which voice input to use
    if isinstance(record_audio, str):
        # Component configured to hand over a file path.
        voice_sample_path = record_audio
    elif record_audio is not None:
        # Gradio's numpy audio is (sample_rate, samples) -- in that order.
        sample_rate, samples = record_audio
        samples = np.asarray(samples)

        # torchaudio.save expects float audio in [-1, 1]; integer PCM from
        # the browser is scaled by the max of its integer type.
        if np.issubdtype(samples.dtype, np.integer):
            samples = samples.astype(np.float32) / np.iinfo(samples.dtype).max
        else:
            samples = samples.astype(np.float32)

        waveform = torch.from_numpy(samples)
        if waveform.ndim == 1:
            waveform = waveform.unsqueeze(0)  # (samples,) -> (1, samples)
        else:
            # (samples, channels) -> (channels, samples)
            waveform = waveform.T.contiguous()

        fd, recorded_path = tempfile.mkstemp(suffix='.wav')
        os.close(fd)
        # Save at the true recording rate; process_audio_file resamples to
        # 22.05 kHz later, so pitch and speed are preserved.
        torchaudio.save(recorded_path, waveform, int(sample_rate))
        voice_sample_path = recorded_path
    elif audio_file is not None:
        # Use uploaded audio file
        voice_sample_path = audio_file

    # If no custom voice is provided, fall back to a random voice. ``not``
    # also covers None, which a cleared dropdown can produce.
    if voice_sample_path is None and not preset_voice:
        preset_voice = "random"

    # Generate TTS
    output_path, message = generate_tts_with_voice(text, voice_sample_path, preset_voice)

    return (output_path or None), message

# Build the Gradio UI: text + preset on the left, voice upload/recording on
# the right, a generate button, and the result (audio + status) underneath.
with gr.Blocks(title="Tortoise TTS with Voice Cloning") as demo:
    gr.Markdown("# Tortoise Text-to-Speech with Voice Cloning")
    gr.Markdown("Enter text and either upload a voice sample, record your voice, or select a preset voice.")
    
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to speak",
                placeholder="Enter the text you want to convert to speech...",
                lines=5
            )
            # The empty first choice means "no preset selected"; the handler
            # then falls back to a random voice unless a sample is supplied.
            preset_voice = gr.Dropdown(
                choices=[""] + PRESET_VOICES,
                label="Preset Voice (optional)",
                value=""
            )
            
        with gr.Column():
            gr.Markdown("### Voice Input Options")
            with gr.Tab("Upload Voice"):
                audio_file = gr.Audio(
                    label="Upload Voice Sample (optional)",
                    type="filepath"
                )
            with gr.Tab("Record Voice"):
                # Gradio 4+ (required by ZeroGPU's `spaces`) takes
                # `sources=[...]`; the Gradio 3 keyword `source=` raises
                # TypeError there. type="numpy" makes the handler receive
                # (sample_rate, data).
                record_audio = gr.Audio(
                    label="Record Your Voice (optional)",
                    sources=["microphone"],
                    type="numpy"
                )
    
    generate_button = gr.Button("Generate Speech")
    
    with gr.Row():
        output_audio = gr.Audio(label="Generated Speech")
        output_message = gr.Textbox(label="Status")
    
    # Input order must match tts_interface(text, audio_file, preset_voice,
    # record_audio); outputs are (audio path, status message).
    generate_button.click(
        fn=tts_interface,
        inputs=[text_input, audio_file, preset_voice, record_audio],
        outputs=[output_audio, output_message]
    )
    
    gr.Markdown("### About This App")
    gr.Markdown("""
    This app uses Tortoise-TTS to generate high-quality speech from text.
    
    You can:
    - Enter any text you want to be spoken
    - Upload or record a voice sample for voice cloning
    - Or select from pre-defined voice presets
    
    The app runs on Hugging Face Spaces with Zero-GPU optimization.
    """)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()