Spaces:

RSHVR
/

Command_RTC

Build error

App Files Files Community

Command_RTC / app.py

RSHVR

Update app.py

f3c69f5 verified about 1 month ago

raw

history blame

6.83 kB

	import os
	import tempfile
	import gradio as gr
	import torch
	import torchaudio
	import spaces
	from huggingface_hub import snapshot_download
	from tortoise.api import TextToSpeech
	from tortoise.utils.audio import load_audio
	import numpy as np
	import uuid
	from pydub import AudioSegment

	# Generated audio files are written here; create it up front so that saving
	# a result never fails on a missing directory.
	os.makedirs("outputs", exist_ok=True)

	# Check for CUDA availability (this will show CPU due to Zero-GPU)
	device = "cuda" if torch.cuda.is_available() else "cpu"
	print(f"Initial device check: {device}")

	# Probe tensor whose .device is printed from the request handlers to show
	# where tensors live at that point.
	zero = torch.zeros(1)
	if torch.cuda.is_available():
	    zero = zero.cuda()
	print(f"Zero tensor device: {zero.device}")

	# Tortoise TTS model; stays None until the first generation request
	# constructs it (see generate_tts_with_voice).
	tts = None

	# Voice names offered in the "Preset Voice" dropdown. "random" is the
	# fallback used when the user supplies no voice at all.
	PRESET_VOICES = [
	    "random",
	    "angie",
	    "daniel",
	    "deniro",
	    "emma",
	    "freeman",
	    "geralt",
	    "halle",
	    "jlaw",
	    "lj",
	    "mol",
	    "myself",
	    "pat",
	    "snakes",
	    "tim_reynolds",
	    "tom",
	    "train_atkins",
	    "train_daws",
	    "train_dotrice",
	    "train_dreams",
	    "train_empire",
	    "train_grace",
	    "train_kennard",
	    "train_lescault",
	    "train_mouse",
	    "weaver",
	    "william",
	]

	def process_audio_file(audio_file_path):
	    """Return the path of a 22.05 kHz WAV version of ``audio_file_path``.

	    Files whose name does not end in ``.wav`` are first converted to WAV
	    with pydub. The audio is then resampled to 22050 Hz with torchaudio
	    when its sample rate differs.

	    Args:
	        audio_file_path: Path to the input audio file.

	    Returns:
	        The input path unchanged when it is already a 22050 Hz WAV file,
	        otherwise the path of a newly created temporary WAV file. The
	        caller owns that temporary file; it is not deleted here.

	    Raises:
	        Whatever pydub / torchaudio raise when the file cannot be decoded.
	    """
	    converted_path = None

	    # Convert to WAV format if it's not already. Decoding with pydub is
	    # only needed for this conversion, so WAV input skips it entirely.
	    if not audio_file_path.lower().endswith(".wav"):
	        audio = AudioSegment.from_file(audio_file_path)
	        # Only the unique name is needed; close the handle right away so
	        # it is not leaked and the path can be reopened for writing.
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	            converted_path = tmp.name
	        audio.export(converted_path, format="wav")
	        audio_file_path = converted_path

	    # Resample to 22.05kHz which is what Tortoise expects
	    waveform, sample_rate = torchaudio.load(audio_file_path)
	    if sample_rate != 22050:
	        resampler = torchaudio.transforms.Resample(
	            orig_freq=sample_rate, new_freq=22050
	        )
	        waveform = resampler(waveform)
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	            resampled_path = tmp.name
	        torchaudio.save(resampled_path, waveform, 22050)
	        audio_file_path = resampled_path

	        # The intermediate conversion is superseded by the resampled
	        # file; remove it so temporary files do not pile up.
	        if converted_path is not None:
	            try:
	                os.remove(converted_path)
	            except OSError:
	                pass  # best-effort cleanup only

	    return audio_file_path

	@spaces.GPU
	def generate_tts_with_voice(text, voice_sample_path=None, preset_voice=None,
	                            quality_preset="fast"):
	    """Generate TTS audio using Tortoise with either a custom voice or preset.

	    Args:
	        text: The text to synthesize.
	        voice_sample_path: Optional path to an audio clip of the voice to
	            clone. Takes precedence over ``preset_voice``.
	        preset_voice: Optional name of a built-in Tortoise voice. ``None``,
	            ``""`` and ``"random"`` all select a random voice.
	        quality_preset: Tortoise quality/speed preset forwarded to
	            ``tts_with_preset`` (``"ultra_fast"``, ``"fast"``,
	            ``"standard"`` or ``"high_quality"``).

	    Returns:
	        ``(output_path, message)``. On success ``output_path`` is the WAV
	        file written under ``outputs/``; on failure it is ``None`` and
	        ``message`` starts with ``"Error: "``.
	    """
	    global tts

	    try:
	        # Now that we're inside the @spaces.GPU decorated function, CUDA should be available
	        print(f"GPU function device: {zero.device}")

	        # Initialize TTS model if not already initialized
	        if tts is None:
	            tts = TextToSpeech(use_deepspeed=torch.cuda.is_available())
	            print("TTS model initialized")

	        # With both left as None, Tortoise speaks in a random voice.
	        voice_samples = None
	        conditioning_latents = None

	        if voice_sample_path:
	            # Custom voice: normalise the clip, then load it. load_audio
	            # returns a single tensor, and Tortoise expects a list of
	            # reference clips.
	            voice_sample_path = process_audio_file(voice_sample_path)
	            voice_samples = [load_audio(voice_sample_path, 22050)]
	        elif preset_voice and preset_voice != "random":
	            # Built-in voice: a voice *name* has to be resolved to its
	            # reference clips / latents. It is not a valid value for
	            # tts_with_preset's ``preset`` argument, which selects the
	            # quality level. Imported locally so this function does not
	            # rely on an extra module-level import.
	            from tortoise.utils.audio import load_voice

	            voice_samples, conditioning_latents = load_voice(preset_voice)

	        # Generate the speech
	        output_id = str(uuid.uuid4())[:8]
	        output_path = f"outputs/tts_output_{output_id}.wav"

	        gen = tts.tts_with_preset(
	            text,
	            voice_samples=voice_samples,
	            conditioning_latents=conditioning_latents,
	            preset=quality_preset,
	        )

	        # Save the generated audio
	        torchaudio.save(output_path, gen.squeeze(0).cpu(), 24000)

	        return output_path, "Success: TTS generation completed."
	    except Exception as e:
	        # UI boundary: report the failure in the status box instead of
	        # crashing the request, but keep a trace in the server log.
	        print(f"TTS generation failed: {e!r}")
	        return None, f"Error: {str(e)}"

	@spaces.GPU
	def tts_interface(text, audio_file, preset_voice, record_audio):
	    """Gradio click handler: choose the voice source and run generation.

	    Voice priority is: microphone recording, then uploaded file, then the
	    preset dropdown (falling back to ``"random"`` when nothing is chosen).

	    Args:
	        text: Text to synthesize.
	        audio_file: Path of an uploaded voice sample, or ``None``.
	        preset_voice: Selected preset voice name; ``""`` or ``None`` when
	            nothing is selected.
	        record_audio: Microphone recording as Gradio's numpy audio value,
	            a ``(sample_rate, samples)`` tuple, or ``None``.

	    Returns:
	        ``(output_path, message)`` as returned by
	        ``generate_tts_with_voice``; ``output_path`` is ``None`` on error.
	    """
	    print(f"Processing with device: {zero.device}")

	    if not text or not text.strip():
	        return None, "Error: Please enter some text to speak."

	    voice_sample_path = None

	    # Determine which voice input to use
	    if record_audio is not None:
	        # The tuple is (sample_rate, samples), in that order.
	        sample_rate, samples = record_audio
	        pcm = np.asarray(samples)
	        if np.issubdtype(pcm.dtype, np.integer):
	            # Integer PCM -> float32 in [-1, 1].
	            full_scale = float(np.iinfo(pcm.dtype).max)
	            pcm = pcm.astype(np.float32) / full_scale
	        else:
	            pcm = pcm.astype(np.float32)

	        waveform = torch.from_numpy(pcm)
	        if waveform.ndim == 1:
	            waveform = waveform.unsqueeze(0)  # (samples,) -> (1, samples)
	        else:
	            # (samples, channels) -> (channels, samples) for torchaudio
	            waveform = waveform.T.contiguous()

	        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
	            voice_sample_path = tmp.name
	        # Save at the recording's real rate; resampling to 22050 Hz is
	        # done later by process_audio_file. Relabelling the rate here
	        # would change the pitch and speed of the sample.
	        torchaudio.save(voice_sample_path, waveform, int(sample_rate))
	    elif audio_file is not None:
	        # Use uploaded audio file
	        voice_sample_path = audio_file

	    # If no custom voice is provided, use the preset
	    if voice_sample_path is None and not preset_voice:
	        preset_voice = "random"

	    # Generate TTS; the helper already returns (None, message) on failure.
	    return generate_tts_with_voice(text, voice_sample_path, preset_voice)

	# Create Gradio interface: text and preset on the left, voice-sample inputs
	# (upload or microphone) on the right, generated audio and status below.
	with gr.Blocks(title="Tortoise TTS with Voice Cloning") as demo:
	    gr.Markdown("# Tortoise Text-to-Speech with Voice Cloning")
	    gr.Markdown("Enter text and either upload a voice sample, record your voice, or select a preset voice.")

	    with gr.Row():
	        with gr.Column():
	            text_input = gr.Textbox(
	                label="Text to speak",
	                placeholder="Enter the text you want to convert to speech...",
	                lines=5,
	            )
	            # The leading "" entry means "no preset selected".
	            preset_voice = gr.Dropdown(
	                choices=[""] + PRESET_VOICES,
	                label="Preset Voice (optional)",
	                value="",
	            )

	        with gr.Column():
	            gr.Markdown("### Voice Input Options")
	            with gr.Tab("Upload Voice"):
	                audio_file = gr.Audio(
	                    label="Upload Voice Sample (optional)",
	                    type="filepath",
	                )
	            with gr.Tab("Record Voice"):
	                # Gradio 4 replaced the `source` keyword with the list
	                # valued `sources`. type="numpy" hands the handler a
	                # (sample_rate, samples) tuple.
	                record_audio = gr.Audio(
	                    label="Record Your Voice (optional)",
	                    sources=["microphone"],
	                    type="numpy",
	                )

	    generate_button = gr.Button("Generate Speech")

	    with gr.Row():
	        output_audio = gr.Audio(label="Generated Speech")
	        output_message = gr.Textbox(label="Status")

	    # Input order must match tts_interface's parameter order.
	    generate_button.click(
	        fn=tts_interface,
	        inputs=[text_input, audio_file, preset_voice, record_audio],
	        outputs=[output_audio, output_message],
	    )

	    gr.Markdown("### About This App")
	    gr.Markdown("""
	This app uses Tortoise-TTS to generate high-quality speech from text.

	You can:
	- Enter any text you want to be spoken
	- Upload or record a voice sample for voice cloning
	- Or select from pre-defined voice presets

	The app runs on Hugging Face Spaces with Zero-GPU optimization.
	""")

	if __name__ == "__main__":
	    demo.launch()