# tts-xtts2-multi / app.py
# (Hugging Face file-viewer header captured with the source:
#  TaiYouWeb, "Update app.py", db3663c verified, 3.82 kB)
import io
import os
import tempfile
from typing import List
import TTS.api
import torch
from pydub import AudioSegment
import gradio as gr # Gradio库
import config
# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Instantiate every entry of config.models once at import time and move it to
# the selected device. The dict is keyed by the config key so callers can look
# models up by name (synthesize_tts uses models['multi']).
# A comprehension is used so that no loop variables (previously `id`, which
# shadowed the builtin, and `model`) leak into the module namespace.
models = {
    model_id: TTS.api.TTS(model_name).to(device)
    for model_id, model_name in config.models.items()
}
def _read_upload_bytes(upload):
    """Return the raw bytes of one uploaded speaker file.

    ``upload`` may be a filesystem path (``str`` / ``os.PathLike``) or any
    object exposing ``read()``.
    NOTE(review): which of the two Gradio hands over depends on the installed
    Gradio version and the File component's ``type`` -- confirm against the
    deployed version.
    """
    if isinstance(upload, (str, os.PathLike)):
        with open(upload, "rb") as handle:
            return handle.read()
    return upload.read()


def _write_speaker_wav(upload, temp_files):
    """Convert one upload to a temporary ``.wav`` file and record its path.

    The path is appended to ``temp_files`` *before* any data is written so the
    caller's cleanup still removes the file if the write fails.

    Raises:
        gr.Error: if pydub cannot decode or re-export the uploaded audio.
    """
    raw_bytes = _read_upload_bytes(upload)
    try:
        # Re-encode whatever format was uploaded as WAV using pydub.
        audio = AudioSegment.from_file(io.BytesIO(raw_bytes))
        wav_buffer = io.BytesIO()
        audio.export(wav_buffer, format="wav")
    except Exception as exc:
        # The Gradio output is an Audio component; returning a message string
        # would be treated as audio data, so surface the problem as an error.
        raise gr.Error(f"Error processing audio file: {exc}") from exc

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav_file:
        temp_files.append(temp_wav_file.name)
        temp_wav_file.write(wav_buffer.getvalue())


def synthesize_tts(
    text: str = 'Hello, World!',
    speaker_wavs: List[gr.File] = None,
    speaker_idx: str = 'Ana Florence',
    language: str = 'ja',
    temperature: float = 0.65,
    length_penalty: float = 1.0,
    repetition_penalty: float = 2.0,
    top_k: int = 50,
    top_p: float = 0.8,
    speed: float = 1.0,
    enable_text_splitting: bool = True,
):
    """Synthesize ``text`` with the ``'multi'`` model and return the audio bytes.

    When ``speaker_wavs`` contains uploads, each one is converted to a
    temporary WAV file and the list of paths is passed to the model as
    ``speaker_wav``. Otherwise ``speaker_idx`` is passed as ``speaker``.

    Args:
        text: Text to synthesize.
        speaker_wavs: Optional uploaded reference audio file(s). A single
            upload (not wrapped in a list) is accepted as well.
        speaker_idx: Speaker name used when no reference audio is supplied.
        language: Language code forwarded to the model.
        temperature, length_penalty, repetition_penalty, top_k, top_p, speed,
        enable_text_splitting: Forwarded unchanged to ``tts_to_file``.

    Returns:
        The bytes that ``tts_to_file`` wrote into an in-memory buffer
        (presumably WAV-encoded audio -- confirm against the TTS library).

    Raises:
        gr.Error: if an uploaded speaker file cannot be decoded.
    """
    temp_files = []
    try:
        if not speaker_wavs:
            uploads = []
        elif isinstance(speaker_wavs, (list, tuple)):
            uploads = speaker_wavs
        else:
            # A lone upload would otherwise be iterated element by element.
            uploads = [speaker_wavs]

        for speaker_wav in uploads:
            _write_speaker_wav(speaker_wav, temp_files)

        # The two synthesis modes differ only in how the voice is selected.
        if temp_files:
            voice_kwargs = {"speaker_wav": temp_files}
        else:
            voice_kwargs = {"speaker": speaker_idx}

        output_buffer = io.BytesIO()
        models['multi'].tts_to_file(
            text=text,
            language=language,
            file_path=output_buffer,
            temperature=temperature,
            length_penalty=length_penalty,
            repetition_penalty=repetition_penalty,
            top_k=top_k,
            top_p=top_p,
            speed=speed,
            enable_text_splitting=enable_text_splitting,
            **voice_kwargs,
        )
        return output_buffer.getvalue()
    finally:
        # Always remove the temporary speaker files, on success and on error.
        for temp_file in temp_files:
            try:
                os.remove(temp_file)
            except FileNotFoundError:
                pass
# Build the Gradio interface. The order of `inputs` must match the positional
# parameter order of synthesize_tts.
inputs = [
    gr.Textbox(value="Hello, World!", label="Text to Synthesize"),
    # file_count="multiple" makes the component accept several uploads; the
    # former `optional=True, multiple=True` keywords are not gr.File parameters.
    gr.File(
        file_types=["audio"],
        file_count="multiple",
        label="Speaker WAV files (optional)",
    ),
    gr.Textbox(value="Ana Florence", label="Speaker Index"),
    gr.Textbox(value="ja", label="Language"),
    gr.Slider(0, 1, value=0.65, step=0.01, label="Temperature"),
    gr.Slider(0.5, 2, value=1.0, step=0.1, label="Length Penalty"),
    gr.Slider(1, 10, value=2.0, step=0.1, label="Repetition Penalty"),
    gr.Slider(1, 100, value=50, step=1, label="Top-K"),
    gr.Slider(0, 1, value=0.8, step=0.01, label="Top-P"),
    gr.Slider(0.5, 2, value=1.0, step=0.01, label="Speed"),
    gr.Checkbox(value=True, label="Enable Text Splitting"),
]
outputs = gr.Audio(label="Generated Speech")

# Create the app and start serving it as soon as the module is executed.
gr.Interface(
    fn=synthesize_tts,
    inputs=inputs,
    outputs=outputs,
    title="Text-to-Speech Synthesis with Gradio",
).launch()