File size: 4,180 Bytes
5ca847f
 
 
db3663c
5ca847f
 
aa93b1b
5ca847f
 
db3663c
5ca847f
 
 
 
 
aa93b1b
 
 
 
 
 
 
 
 
5ca847f
 
aa93b1b
5ca847f
 
 
db3663c
 
 
 
 
 
 
 
 
 
 
 
 
5ca847f
 
 
 
 
5a12fa3
5ca847f
 
 
 
 
 
 
db3663c
5ca847f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db3663c
5ca847f
db3663c
 
5ca847f
 
 
db3663c
 
 
 
 
 
783ad44
db3663c
5a12fa3
db3663c
5a12fa3
 
db3663c
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import io
import os
import tempfile
from typing import List

import TTS.api
import TTS.utils.manage as manage
import torch
from pydub import AudioSegment
import gradio as gr  # Gradio库

import config

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Stand-in for the interactive terms-of-service prompt: always accepts.
def ask_tos_patch(self, output_path):
    """Report the terms of service as accepted without prompting.

    Args:
        self: Instance the method is bound to; not used.
        output_path: Accepted for signature compatibility; not used.

    Returns:
        Always ``True``.
    """
    accepted = True
    print("Automatically accepting the terms of service.")
    return accepted

# Replace the original ask_tos method with the auto-accepting function so that
# model downloads below never block on an interactive prompt.
manage.ModelManager.ask_tos = ask_tos_patch
tts = TTS.api.TTS()

# For every entry in config.models: download the model by name, then load it
# onto the selected device and register it under its configured key.
# (Loop variables are named so the builtin `id` is not shadowed at module scope.)
models = {}
for model_id, model_name in config.models.items():
    tts.download_model_by_name(model_name)
    models[model_id] = TTS.api.TTS(model_name).to(device)


def _read_speaker_bytes(speaker_wav) -> bytes:
    """Return the raw bytes of one uploaded speaker reference.

    Accepts either a filesystem path (``str`` / ``os.PathLike``) or any
    object exposing ``read()``, so the caller works whether the upload is
    delivered as a path or as an open file object.
    """
    if isinstance(speaker_wav, (str, os.PathLike)):
        with open(speaker_wav, "rb") as source:
            return source.read()
    return speaker_wav.read()


def synthesize_tts(
    text: str = 'Hello, World!',
    speaker_wavs: List[gr.File] = None,
    speaker_idx: str = 'Ana Florence',
    language: str = 'ja',
    temperature: float = 0.65,
    length_penalty: float = 1.0,
    repetition_penalty: float = 2.0,
    top_k: int = 50,
    top_p: float = 0.8,
    speed: float = 1.0,
    enable_text_splitting: bool = True,
):
    """Synthesize ``text`` with ``models['multi']`` and return the audio bytes.

    When ``speaker_wavs`` is non-empty, each upload is decoded with pydub,
    re-encoded as WAV into a temporary file, and the list of temporary paths
    is passed to ``tts_to_file`` as ``speaker_wav``. Otherwise ``speaker_idx``
    is passed as ``speaker``. All temporary files are removed before returning.

    Args:
        text: Text to synthesize.
        speaker_wavs: Optional uploads used as speaker references. Each item
            may be a filesystem path or an object with a ``read()`` method.
        speaker_idx: Speaker name used when no reference audio is supplied.
        language: Language code forwarded to ``tts_to_file``.
        temperature: Forwarded unchanged to ``tts_to_file``.
        length_penalty: Forwarded unchanged to ``tts_to_file``.
        repetition_penalty: Forwarded unchanged to ``tts_to_file``.
        top_k: Forwarded unchanged to ``tts_to_file``.
        top_p: Forwarded unchanged to ``tts_to_file``.
        speed: Forwarded unchanged to ``tts_to_file``.
        enable_text_splitting: Forwarded unchanged to ``tts_to_file``.

    Returns:
        The bytes that ``tts_to_file`` wrote into an in-memory buffer.

    Raises:
        gr.Error: If an uploaded file cannot be decoded or converted to WAV.
    """
    temp_files = []
    try:
        for speaker_wav in speaker_wavs or []:
            speaker_wav_bytes = _read_speaker_bytes(speaker_wav)
            # Normalize whatever container/codec was uploaded to WAV via pydub.
            try:
                audio = AudioSegment.from_file(io.BytesIO(speaker_wav_bytes))
                wav_buffer = io.BytesIO()
                audio.export(wav_buffer, format="wav")
            except Exception as e:
                # The result feeds an audio output, where a returned string
                # would be taken for a file path; raise so the message is
                # surfaced to the user as an error instead.
                raise gr.Error(f"Error processing audio file: {e}") from e

            temp_wav_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
            # Record the path before writing so cleanup still happens if the
            # write itself fails.
            temp_files.append(temp_wav_file.name)
            with temp_wav_file:
                temp_wav_file.write(wav_buffer.getvalue())

        # Arguments shared by both the cloned-voice and named-speaker paths.
        synthesis_kwargs = {
            "text": text,
            "language": language,
            "temperature": temperature,
            "length_penalty": length_penalty,
            "repetition_penalty": repetition_penalty,
            "top_k": top_k,
            "top_p": top_p,
            "speed": speed,
            "enable_text_splitting": enable_text_splitting,
        }
        if temp_files:
            synthesis_kwargs["speaker_wav"] = temp_files
        else:
            synthesis_kwargs["speaker"] = speaker_idx

        output_buffer = io.BytesIO()
        models['multi'].tts_to_file(file_path=output_buffer, **synthesis_kwargs)
        return output_buffer.getvalue()

    finally:
        for temp_file in temp_files:
            try:
                os.remove(temp_file)
            except FileNotFoundError:
                # Already gone; nothing left to clean up for this entry.
                pass


# Build the Gradio interface: one input component per synthesize_tts
# parameter, listed in the same order as the function signature.
inputs = [
    gr.Textbox(label="Text to Synthesize", value="Hello, World!"),
    gr.File(
        label="Speaker WAV files (optional)",
        file_types=["audio"],
        file_count="multiple",
    ),
    gr.Textbox(label="Speaker Index", value="Ana Florence"),
    gr.Textbox(label="Language", value="en"),
    gr.Slider(minimum=0, maximum=1, value=0.65, step=0.01, label="Temperature"),
    gr.Slider(minimum=0.5, maximum=2, value=1.0, step=0.1, label="Length Penalty"),
    gr.Slider(minimum=1.0, maximum=10.0, value=2.0, step=0.1, label="Repetition Penalty"),
    gr.Slider(minimum=1, maximum=100, value=50, step=1, label="Top-K"),
    gr.Slider(minimum=0, maximum=1, value=0.8, step=0.01, label="Top-P"),
    gr.Slider(minimum=0.5, maximum=2, value=1.0, step=0.01, label="Speed"),
    gr.Checkbox(label="Enable Text Splitting", value=True),
]

outputs = gr.Audio(label="Generated Speech")

# Wire the components to the synthesis function and start the app.
demo = gr.Interface(
    fn=synthesize_tts,
    inputs=inputs,
    outputs=outputs,
    title="Text-to-Speech Synthesis with Gradio",
)
demo.launch()