"""Gradio front end for text-to-speech synthesis with Coqui TTS models."""

import io
import os
import tempfile
from typing import Any, List, Optional

import gradio as gr  # Gradio library
import torch
import TTS.api
from pydub import AudioSegment

import config

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Instantiate every model listed in config.models once at start-up, keyed by
# the same id used in the config.
models = {
    model_id: TTS.api.TTS(model_name).to(device)
    for model_id, model_name in config.models.items()
}


def _load_audio(upload: Any) -> AudioSegment:
    """Decode one uploaded reference clip into a pydub ``AudioSegment``.

    Gradio hands uploads over either as a path string or as a temp-file
    wrapper exposing the path as ``.name`` (depending on the Gradio version
    and the File component's ``type``); any other object is read through
    ``.read()`` like the original implementation did.
    """
    if isinstance(upload, (str, os.PathLike)):
        return AudioSegment.from_file(upload)
    path = getattr(upload, "name", None)
    if isinstance(path, str) and os.path.exists(path):
        return AudioSegment.from_file(path)
    return AudioSegment.from_file(io.BytesIO(upload.read()))


def synthesize_tts(
    text: str = "Hello, World!",
    speaker_wavs: Optional[List[Any]] = None,
    speaker_idx: str = "Ana Florence",
    language: str = "ja",
    temperature: float = 0.65,
    length_penalty: float = 1.0,
    repetition_penalty: float = 2.0,
    top_k: int = 50,
    top_p: float = 0.8,
    speed: float = 1.0,
    enable_text_splitting: bool = True,
):
    """Synthesize ``text`` with the ``'multi'`` model and return WAV bytes.

    When reference clips are supplied they are converted to WAV, written to
    temporary files and passed to the model as ``speaker_wav``; otherwise the
    named speaker ``speaker_idx`` is used.

    Args:
        text: Text to speak.
        speaker_wavs: Uploaded reference clips (paths, file wrappers with a
            ``.name`` path, or readable file-like objects). A single upload
            is accepted as well as a list. ``None``/empty selects
            ``speaker_idx`` instead.
        speaker_idx: Speaker name passed as ``speaker`` when no clips are given.
        language: Language code forwarded to the model.
        temperature, length_penalty, repetition_penalty, top_k, top_p, speed,
        enable_text_splitting: Forwarded unchanged to ``tts_to_file``
            (``top_k`` is coerced to ``int``).

    Returns:
        The bytes that ``tts_to_file`` wrote to the in-memory output buffer.

    Raises:
        gr.Error: If a reference clip cannot be decoded or converted.
    """
    # Normalise the upload argument: nothing, one upload, or a sequence.
    if not speaker_wavs:
        uploads = []
    elif isinstance(speaker_wavs, (list, tuple)):
        uploads = list(speaker_wavs)
    else:
        uploads = [speaker_wavs]

    temp_files: List[str] = []
    try:
        for upload in uploads:
            # Convert each upload to WAV in memory first, so a decode failure
            # never leaves a temp file behind.
            try:
                wav_buffer = io.BytesIO()
                _load_audio(upload).export(wav_buffer, format="wav")
            except Exception as e:
                # Surface the problem in the UI instead of returning a string
                # that the Audio output would mistake for a file path.
                raise gr.Error(f"Error processing audio file: {e}") from e

            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
                # Register the name before writing so cleanup always sees it.
                temp_files.append(handle.name)
                handle.write(wav_buffer.getvalue())

        # Options shared by both speaker-selection modes.
        synthesis_options = {
            "text": text,
            "language": language,
            "temperature": temperature,
            "length_penalty": length_penalty,
            "repetition_penalty": repetition_penalty,
            "top_k": int(top_k),
            "top_p": top_p,
            "speed": speed,
            "enable_text_splitting": enable_text_splitting,
        }
        if temp_files:
            synthesis_options["speaker_wav"] = temp_files
        else:
            synthesis_options["speaker"] = speaker_idx

        # NOTE(review): file_path is given an in-memory buffer rather than a
        # path, as in the original; confirm the installed TTS version's WAV
        # writer accepts file-like objects.
        output_buffer = io.BytesIO()
        models["multi"].tts_to_file(file_path=output_buffer, **synthesis_options)
        return output_buffer.getvalue()
    finally:
        # Best-effort removal of the temporary reference clips.
        for temp_file in temp_files:
            try:
                os.remove(temp_file)
            except FileNotFoundError:
                pass


# Create the Gradio interface. Input order must match synthesize_tts's
# positional parameters.
inputs = [
    gr.Textbox(value="Hello, World!", label="Text to Synthesize"),
    # Multi-file upload is requested via file_count; leaving the input empty
    # selects the named speaker instead.
    gr.File(
        file_types=["audio"],
        file_count="multiple",
        label="Speaker WAV files (optional)",
    ),
    gr.Textbox(value="Ana Florence", label="Speaker Index"),
    gr.Textbox(value="ja", label="Language"),
    gr.Slider(0, 1, value=0.65, step=0.01, label="Temperature"),
    gr.Slider(0.5, 2, value=1.0, step=0.1, label="Length Penalty"),
    gr.Slider(1, 10, value=2.0, step=0.1, label="Repetition Penalty"),
    gr.Slider(1, 100, value=50, step=1, label="Top-K"),
    gr.Slider(0, 1, value=0.8, step=0.01, label="Top-P"),
    gr.Slider(0.5, 2, value=1.0, step=0.01, label="Speed"),
    gr.Checkbox(value=True, label="Enable Text Splitting"),
]

outputs = gr.Audio(label="Generated Speech")

demo = gr.Interface(
    fn=synthesize_tts,
    inputs=inputs,
    outputs=outputs,
    title="Text-to-Speech Synthesis with Gradio",
)
demo.launch()