Command_RTC / app.py
RSHVR's picture
Update app.py
f3c69f5 verified
raw
history blame
6.83 kB
import os
import tempfile
import gradio as gr
import torch
import torchaudio
import spaces
from huggingface_hub import snapshot_download
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio
import numpy as np
import uuid
from pydub import AudioSegment
# Generated audio files are written under ./outputs; make sure it exists.
os.makedirs("outputs", exist_ok=True)

# Report which device is visible at import time.
# NOTE(review): the original comment says this reports CPU under Zero-GPU
# because the GPU is only attached inside @spaces.GPU functions — confirm.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"Initial device check: {device}")

# Probe tensor whose .device is printed here and again inside the GPU
# functions, to show where tensors actually live.
zero = torch.Tensor([0])
zero = zero.cuda() if torch.cuda.is_available() else zero
print(f"Zero tensor device: {zero.device}")

# Tortoise TTS model handle; stays None until the first generation request.
tts = None

# Voice names offered in the "Preset Voice" dropdown.
PRESET_VOICES = [
    "random",
    "angie",
    "daniel",
    "deniro",
    "emma",
    "freeman",
    "geralt",
    "halle",
    "jlaw",
    "lj",
    "mol",
    "myself",
    "pat",
    "snakes",
    "tim_reynolds",
    "tom",
    "train_atkins",
    "train_daws",
    "train_dotrice",
    "train_dreams",
    "train_empire",
    "train_grace",
    "train_kennard",
    "train_lescault",
    "train_mouse",
    "weaver",
    "william",
]
def process_audio_file(audio_file_path):
    """Return a path to a 22.05 kHz WAV version of ``audio_file_path``.

    Non-WAV inputs are decoded with pydub and exported as WAV; the result is
    then resampled to 22050 Hz if its sample rate differs.

    Args:
        audio_file_path: Path to the uploaded or recorded audio file.

    Returns:
        The original path when the file is already a 22050 Hz WAV, otherwise
        the path of a newly created temporary WAV file. Temporary files are
        created with ``delete=False``, so the caller is responsible for
        removing a returned path that differs from the input.

    Raises:
        Whatever pydub / torchaudio raise when the file cannot be decoded.
    """
    target_sample_rate = 22050
    converted_path = None

    # Convert to WAV format if it's not already. The file is only decoded
    # with pydub when a conversion is actually required.
    if not audio_file_path.lower().endswith('.wav'):
        audio = AudioSegment.from_file(audio_file_path)
        # Reserve a name, then close the handle immediately so it is not
        # leaked and the path can be reopened for writing on any platform.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
            converted_path = temp_wav.name
        audio.export(converted_path, format="wav")
        audio_file_path = converted_path

    # Resample to 22.05kHz, the rate the caller loads voice samples at.
    waveform, sample_rate = torchaudio.load(audio_file_path)
    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_sample_rate
        )
        waveform = resampler(waveform)
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
            resampled_path = temp_file.name
        torchaudio.save(resampled_path, waveform, target_sample_rate)
        audio_file_path = resampled_path

        # The intermediate WAV conversion is superseded by the resampled
        # copy; remove it so it does not accumulate in the temp directory.
        if converted_path is not None:
            try:
                os.remove(converted_path)
            except OSError:
                pass  # best-effort cleanup only

    return audio_file_path
@spaces.GPU
def generate_tts_with_voice(text, voice_sample_path=None, preset_voice=None):
    """Generate TTS audio using Tortoise with either a custom voice or a preset.

    Args:
        text: The text to synthesize.
        voice_sample_path: Optional path to an audio sample to clone. When
            given it takes precedence over ``preset_voice``.
        preset_voice: Optional name of a bundled Tortoise voice. A falsy
            value or ``"random"`` lets Tortoise pick a random voice.

    Returns:
        ``(output_path, message)``: the path of the generated WAV under
        ``outputs/`` and a success message, or ``(None, "Error: ...")`` when
        anything fails. Exceptions are reported through the message rather
        than raised.
    """
    global tts
    try:
        # Function-scope import so this block does not depend on a change to
        # the file-level import list.
        from tortoise.utils.audio import load_voice

        # Inside the @spaces.GPU decorated function CUDA should be available.
        print(f"GPU function device: {zero.device}")

        if not text or not text.strip():
            return None, "Error: Please enter some text to synthesize."

        # Initialize the TTS model on first use only.
        if tts is None:
            tts = TextToSpeech(use_deepspeed=torch.cuda.is_available())
            print("TTS model initialized")

        voice_samples = None
        conditioning_latents = None
        if voice_sample_path:
            # Custom voice: normalise the sample, then load it as a tensor.
            processed_path = process_audio_file(voice_sample_path)
            try:
                # load_audio returns a single tensor (not a tuple), so it
                # must not be unpacked.
                voice_samples = [load_audio(processed_path, 22050)]
            finally:
                # process_audio_file may have created a temporary copy;
                # remove it once the samples are in memory.
                if processed_path != voice_sample_path:
                    try:
                        os.remove(processed_path)
                    except OSError:
                        pass  # best-effort cleanup only
        elif preset_voice and preset_voice != "random":
            # Bundled voice: load_voice yields reference clips and/or
            # precomputed conditioning latents for the named voice.
            voice_samples, conditioning_latents = load_voice(preset_voice)
        # Otherwise both stay None, which makes Tortoise use a random voice.

        # Generate the speech. The voice is selected through voice_samples /
        # conditioning_latents; ``preset`` of tts_with_preset is a *quality*
        # preset, so the library default is used instead of a voice name.
        output_id = str(uuid.uuid4())[:8]
        output_path = f"outputs/tts_output_{output_id}.wav"
        gen = tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
        )

        # Save the generated audio (written at 24 kHz).
        torchaudio.save(output_path, gen.squeeze(0).cpu(), 24000)
        return output_path, "Success: TTS generation completed."
    except Exception as e:
        # Top-level UI boundary: surface the failure in the status box.
        return None, f"Error: {str(e)}"
@spaces.GPU
def tts_interface(text, audio_file, preset_voice, record_audio):
    """Gradio callback: choose the voice source and run TTS generation.

    A microphone recording takes precedence over an uploaded file; when
    neither is supplied the preset voice is used, defaulting to ``"random"``.

    Args:
        text: Text to synthesize.
        audio_file: Path of an uploaded voice sample, or ``None``.
        preset_voice: Selected preset voice name (``""`` when unset).
        record_audio: Microphone recording as delivered by ``gr.Audio`` in
            its default numpy mode, i.e. ``(sample_rate, samples)``, or
            ``None``.

    Returns:
        ``(output_path, message)`` as returned by ``generate_tts_with_voice``;
        ``output_path`` is ``None`` on failure.
    """
    # NOTE(review): generate_tts_with_voice is also decorated with
    # @spaces.GPU — confirm that nesting GPU-decorated calls is intended.
    print(f"Processing with device: {zero.device}")
    voice_sample_path = None
    recorded_path = None

    # Determine which voice input to use
    if record_audio is not None:
        # Gradio's numpy audio value is (sample_rate, data): keep the real
        # sample rate so the later resampling step does not shift the pitch.
        sample_rate, samples = record_audio
        waveform = torch.as_tensor(samples)
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)  # mono -> (1, samples)
        else:
            # (samples, channels) -> (channels, samples) for torchaudio.save
            waveform = waveform.t().contiguous()
        if waveform.is_floating_point():
            waveform = waveform.to(torch.float32)
        else:
            # Integer PCM -> float32 in [-1, 1]
            full_scale = float(torch.iinfo(waveform.dtype).max)
            waveform = waveform.to(torch.float32) / full_scale

        # Reserve a temp path and close the handle before writing to it.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
            recorded_path = temp_file.name
        torchaudio.save(recorded_path, waveform, int(sample_rate))
        voice_sample_path = recorded_path
    elif audio_file is not None:
        # Use uploaded audio file
        voice_sample_path = audio_file

    # If no custom voice is provided and no preset chosen, use a random voice
    if voice_sample_path is None and not preset_voice:
        preset_voice = "random"

    # Generate TTS; on failure output_path is already None.
    try:
        return generate_tts_with_voice(text, voice_sample_path, preset_voice)
    finally:
        # The recording was only needed as input; don't leave it in /tmp.
        if recorded_path is not None:
            try:
                os.remove(recorded_path)
            except OSError:
                pass  # best-effort cleanup only
# Create Gradio interface: text + preset on the left, voice sample inputs
# (upload / microphone) on the right, generated audio and status below.
with gr.Blocks(title="Tortoise TTS with Voice Cloning") as demo:
    gr.Markdown("# Tortoise Text-to-Speech with Voice Cloning")
    gr.Markdown("Enter text and either upload a voice sample, record your voice, or select a preset voice.")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to speak",
                placeholder="Enter the text you want to convert to speech...",
                lines=5
            )
            # Empty string means "no preset selected".
            preset_voice = gr.Dropdown(
                choices=[""] + PRESET_VOICES,
                label="Preset Voice (optional)",
                value=""
            )
        with gr.Column():
            gr.Markdown("### Voice Input Options")
            with gr.Tab("Upload Voice"):
                # Delivered to the callback as a file path.
                audio_file = gr.Audio(
                    label="Upload Voice Sample (optional)",
                    type="filepath"
                )
            with gr.Tab("Record Voice"):
                # Gradio 4 replaced the ``source`` keyword with the
                # ``sources`` list; the old keyword raises a TypeError there.
                # Default type is numpy, so the callback gets a tuple.
                record_audio = gr.Audio(
                    label="Record Your Voice (optional)",
                    sources=["microphone"]
                )

    generate_button = gr.Button("Generate Speech")

    with gr.Row():
        output_audio = gr.Audio(label="Generated Speech")
        output_message = gr.Textbox(label="Status")

    # Input order must match the parameter order of tts_interface.
    generate_button.click(
        fn=tts_interface,
        inputs=[text_input, audio_file, preset_voice, record_audio],
        outputs=[output_audio, output_message]
    )

    gr.Markdown("### About This App")
    gr.Markdown("""
This app uses Tortoise-TTS to generate high-quality speech from text.
You can:
- Enter any text you want to be spoken
- Upload or record a voice sample for voice cloning
- Or select from pre-defined voice presets
The app runs on Hugging Face Spaces with Zero-GPU optimization.
""")

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()