积极的屁孩 committed
Commit c3e56e6 · 1 Parent(s): 5e1a778
Files changed (2)
  1. app.py +378 -4
  2. requirements.txt +12 -0
app.py CHANGED
@@ -1,7 +1,381 @@
+import os
+import sys
+import shutil
 import gradio as gr
+import torch
+import tempfile
+from pathlib import Path
+from huggingface_hub import snapshot_download, hf_hub_download
 
-def greet(name):
-    return "Hello " + name + "!!"
+# Add the model directory to the system path
+sys.path.append(".")
 
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()
+# Import the Vevo utility classes
+from models.vc.vevo.vevo_utils import VevoInferencePipeline, save_audio
+
+# Model configuration constants
+REPO_ID = "amphion/Vevo"
+CACHE_DIR = "./ckpts/Vevo"
+
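+# Wraps the Vevo checkpoints and exposes one lazily initialized pipeline per task.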
+class VevoGradioApp:
+    def __init__(self):
+        # Device setup
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.pipelines = {}
+        # Config file paths
+        self.config_paths = {
+            "vq32tovq8192": "./models/vc/vevo/config/Vq32ToVq8192.json",
+            "vq8192tomels": "./models/vc/vevo/config/Vq8192ToMels.json",
+            "phonetovq8192": "./models/vc/vevo/config/PhoneToVq8192.json",
+            "vocoder": "./models/vc/vevo/config/Vocoder.json"
+        }
+
+        # Make sure the config files exist
+        self.download_configs()
+
+    def download_configs(self):
+        """Download the required configuration files."""
+        os.makedirs("./models/vc/vevo/config", exist_ok=True)
+        config_files = {
+            "Vq32ToVq8192.json": "https://raw.githubusercontent.com/open-mmlab/Amphion/main/models/vc/vevo/config/Vq32ToVq8192.json",
+            "Vq8192ToMels.json": "https://raw.githubusercontent.com/open-mmlab/Amphion/main/models/vc/vevo/config/Vq8192ToMels.json",
+            "PhoneToVq8192.json": "https://raw.githubusercontent.com/open-mmlab/Amphion/main/models/vc/vevo/config/PhoneToVq8192.json",
+            "Vocoder.json": "https://raw.githubusercontent.com/open-mmlab/Amphion/main/models/vc/vevo/config/Vocoder.json"
+        }
+
+        for filename, url in config_files.items():
+            target_path = f"./models/vc/vevo/config/{filename}"
+            if not os.path.exists(target_path):
+                try:
+                    hf_hub_download(repo_id="Amphion/Vevo-configs", filename=filename, repo_type="dataset", local_dir="./models/vc/vevo/config/")
+                except Exception:
+                    # If the download from Hugging Face fails, create a placeholder file
+                    with open(target_path, 'w') as f:
+                        f.write('{}')
+                    print(f"Could not download config file {filename}; created a placeholder. Please add the config manually.")
+
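+    # Each init_* method below downloads only the checkpoints its task needs from the
+    # Hugging Face Hub and caches the resulting pipeline in self.pipelines for reuse.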
+    def init_voice_conversion_pipeline(self):
+        """Initialize the voice conversion pipeline."""
+        if "voice" not in self.pipelines:
+            # Content tokenizer
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["tokenizer/vq32/*"],
+            )
+            content_tokenizer_ckpt_path = os.path.join(
+                local_dir, "tokenizer/vq32/hubert_large_l18_c32.pkl"
+            )
+
+            # Content-style tokenizer
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["tokenizer/vq8192/*"],
+            )
+            content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
+
+            # Autoregressive transformer
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
+            )
+            ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/Vq32ToVq8192")
+
+            # Flow-matching transformer
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
+            )
+            fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
+
+            # Vocoder
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["acoustic_modeling/Vocoder/*"],
+            )
+            vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
+
+            # Create the inference pipeline
+            self.pipelines["voice"] = VevoInferencePipeline(
+                content_tokenizer_ckpt_path=content_tokenizer_ckpt_path,
+                content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
+                ar_cfg_path=self.config_paths["vq32tovq8192"],
+                ar_ckpt_path=ar_ckpt_path,
+                fmt_cfg_path=self.config_paths["vq8192tomels"],
+                fmt_ckpt_path=fmt_ckpt_path,
+                vocoder_cfg_path=self.config_paths["vocoder"],
+                vocoder_ckpt_path=vocoder_ckpt_path,
+                device=self.device,
+            )
+
+        return self.pipelines["voice"]
+
+    def init_timbre_pipeline(self):
+        """Initialize the timbre conversion pipeline."""
+        if "timbre" not in self.pipelines:
+            # Content-style tokenizer
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["tokenizer/vq8192/*"],
+            )
+            tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
+
+            # Flow-matching transformer
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
+            )
+            fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
+
+            # Vocoder
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["acoustic_modeling/Vocoder/*"],
+            )
+            vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
+
+            # Create the inference pipeline
+            self.pipelines["timbre"] = VevoInferencePipeline(
+                content_style_tokenizer_ckpt_path=tokenizer_ckpt_path,
+                fmt_cfg_path=self.config_paths["vq8192tomels"],
+                fmt_ckpt_path=fmt_ckpt_path,
+                vocoder_cfg_path=self.config_paths["vocoder"],
+                vocoder_ckpt_path=vocoder_ckpt_path,
+                device=self.device,
+            )
+
+        return self.pipelines["timbre"]
+
+    def init_tts_pipeline(self):
+        """Initialize the text-to-speech pipeline."""
+        if "tts" not in self.pipelines:
+            # Content-style tokenizer
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["tokenizer/vq8192/*"],
+            )
+            content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
+
+            # Autoregressive transformer
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["contentstyle_modeling/PhoneToVq8192/*"],
+            )
+            ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/PhoneToVq8192")
+
+            # Flow-matching transformer
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
+            )
+            fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
+
+            # Vocoder
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["acoustic_modeling/Vocoder/*"],
+            )
+            vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
+
+            # Create the inference pipeline
+            self.pipelines["tts"] = VevoInferencePipeline(
+                content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
+                ar_cfg_path=self.config_paths["phonetovq8192"],
+                ar_ckpt_path=ar_ckpt_path,
+                fmt_cfg_path=self.config_paths["vq8192tomels"],
+                fmt_ckpt_path=fmt_ckpt_path,
+                vocoder_cfg_path=self.config_paths["vocoder"],
+                vocoder_ckpt_path=vocoder_ckpt_path,
+                device=self.device,
+            )
+
+        return self.pipelines["tts"]
+
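+    # The vevo_* methods below are the Gradio callbacks: they receive file paths from
+    # gr.Audio(type="filepath") and return the path of the generated wav file.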
+    def vevo_voice(self, content_audio, reference_audio):
+        """Voice conversion."""
+        pipeline = self.init_voice_conversion_pipeline()
+
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as content_file, \
+             tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as reference_file, \
+             tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
+
+            content_path = content_file.name
+            reference_path = reference_file.name
+            output_path = output_file.name
+
+            # Copy the uploaded audio files (gr.Audio with type="filepath" passes file paths)
+            shutil.copy(content_audio, content_path)
+            shutil.copy(reference_audio, reference_path)
+
+            # Run voice conversion
+            gen_audio = pipeline.inference_ar_and_fm(
+                src_wav_path=content_path,
+                src_text=None,
+                style_ref_wav_path=reference_path,
+                timbre_ref_wav_path=reference_path,
+            )
+            save_audio(gen_audio, output_path=output_path)
+
+            return output_path
+
+    def vevo_style(self, content_audio, style_audio):
+        """Style conversion."""
+        pipeline = self.init_voice_conversion_pipeline()
+
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as content_file, \
+             tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as style_file, \
+             tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
+
+            content_path = content_file.name
+            style_path = style_file.name
+            output_path = output_file.name
+
+            # Copy the uploaded audio files (gr.Audio with type="filepath" passes file paths)
+            shutil.copy(content_audio, content_path)
+            shutil.copy(style_audio, style_path)
+
+            # Run style conversion
+            gen_audio = pipeline.inference_ar_and_fm(
+                src_wav_path=content_path,
+                src_text=None,
+                style_ref_wav_path=style_path,
+                timbre_ref_wav_path=content_path,
+            )
+            save_audio(gen_audio, output_path=output_path)
+
+            return output_path
+
+    def vevo_timbre(self, content_audio, reference_audio):
+        """Timbre conversion."""
+        pipeline = self.init_timbre_pipeline()
+
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as content_file, \
+             tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as reference_file, \
+             tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
+
+            content_path = content_file.name
+            reference_path = reference_file.name
+            output_path = output_file.name
+
+            # Copy the uploaded audio files (gr.Audio with type="filepath" passes file paths)
+            shutil.copy(content_audio, content_path)
+            shutil.copy(reference_audio, reference_path)
+
+            # Run timbre conversion
+            gen_audio = pipeline.inference_fm(
+                src_wav_path=content_path,
+                timbre_ref_wav_path=reference_path,
+                flow_matching_steps=32,
+            )
+            save_audio(gen_audio, output_path=output_path)
+
+            return output_path
+
+    def vevo_tts(self, text, ref_audio, src_language, ref_language, ref_text):
+        """Text-to-speech."""
+        pipeline = self.init_tts_pipeline()
+
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as ref_file, \
+             tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
+
+            ref_path = ref_file.name
+            output_path = output_file.name
+
+            # Copy the uploaded reference audio (gr.Audio with type="filepath" passes a file path)
+            shutil.copy(ref_audio, ref_path)
+
+            # Run text-to-speech
+            gen_audio = pipeline.inference_ar_and_fm(
+                src_wav_path=None,
+                src_text=text,
+                style_ref_wav_path=ref_path,
+                timbre_ref_wav_path=ref_path,
+                style_ref_wav_text=ref_text if ref_text else None,
+                src_text_language=src_language,
+                style_ref_wav_text_language=ref_language,
+            )
+            save_audio(gen_audio, output_path=output_path)
+
+            return output_path
+
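+# Build the Gradio Blocks UI: one tab per task, each wired to a VevoGradioApp method.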
+def create_interface():
+    app = VevoGradioApp()
+
+    with gr.Blocks(title="Vevo Voice Conversion Demo") as demo:
+        gr.Markdown("# Vevo Voice Conversion Model Demo")
+        gr.Markdown("Vevo is a powerful voice conversion model that supports voice conversion, style conversion, timbre conversion, and text-to-speech.")
+
+        with gr.Tab("Voice Conversion"):
+            gr.Markdown("## Voice Conversion (VevoVoice)")
+            gr.Markdown("Convert the content of the source audio to the style and timbre of the reference audio.")
+            with gr.Row():
+                content_audio_voice = gr.Audio(label="Content Audio", type="filepath")
+                reference_audio_voice = gr.Audio(label="Reference Audio", type="filepath")
+            voice_btn = gr.Button("Convert")
+            voice_output = gr.Audio(label="Conversion Result")
+            voice_btn.click(fn=app.vevo_voice, inputs=[content_audio_voice, reference_audio_voice], outputs=voice_output)
+
+        with gr.Tab("Style Conversion"):
+            gr.Markdown("## Style Conversion (VevoStyle)")
+            gr.Markdown("Convert the style of the content audio to that of the reference audio while keeping the original timbre.")
+            with gr.Row():
+                content_audio_style = gr.Audio(label="Content Audio", type="filepath")
+                style_audio = gr.Audio(label="Style Reference Audio", type="filepath")
+            style_btn = gr.Button("Convert")
+            style_output = gr.Audio(label="Conversion Result")
+            style_btn.click(fn=app.vevo_style, inputs=[content_audio_style, style_audio], outputs=style_output)
+
+        with gr.Tab("Timbre Conversion"):
+            gr.Markdown("## Timbre Conversion (VevoTimbre)")
+            gr.Markdown("Convert the timbre of the content audio to that of the reference audio while preserving content and style.")
+            with gr.Row():
+                content_audio_timbre = gr.Audio(label="Content Audio", type="filepath")
+                reference_audio_timbre = gr.Audio(label="Timbre Reference Audio", type="filepath")
+            timbre_btn = gr.Button("Convert")
+            timbre_output = gr.Audio(label="Conversion Result")
+            timbre_btn.click(fn=app.vevo_timbre, inputs=[content_audio_timbre, reference_audio_timbre], outputs=timbre_output)
+
+        with gr.Tab("Text-to-Speech"):
+            gr.Markdown("## Text-to-Speech (VevoTTS)")
+            gr.Markdown("Convert input text to speech, using the style and timbre of the reference audio.")
+            text_input = gr.Textbox(label="Input Text", lines=3)
+            with gr.Row():
+                ref_audio_tts = gr.Audio(label="Reference Audio", type="filepath")
+                src_language = gr.Dropdown(["en", "zh", "ja", "ko"], label="Source Text Language", value="en")
+            with gr.Row():
+                ref_language = gr.Dropdown(["en", "zh", "ja", "ko"], label="Reference Text Language", value="en")
+                ref_text = gr.Textbox(label="Reference Text (optional)", lines=2)
+            tts_btn = gr.Button("Generate")
+            tts_output = gr.Audio(label="Generated Result")
+            tts_btn.click(fn=app.vevo_tts, inputs=[text_input, ref_audio_tts, src_language, ref_language, ref_text], outputs=tts_output)
+
+        gr.Markdown("## About")
+        gr.Markdown("This demo is based on the [Vevo model](https://huggingface.co/amphion/Vevo), developed by [Amphion](https://github.com/open-mmlab/Amphion).")
+
+    return demo
+
+if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,12 @@
+gradio>=4.14.0
+huggingface_hub>=0.20.0
+torch>=2.0.0
+torchaudio>=2.0.0
+numpy>=1.23.0
+librosa>=0.10.0
+accelerate>=0.21.0
+PySoundFile>=0.9.0
+safetensors>=0.4.0
+pyyaml>=6.0
+whisper>=1.1.10
+IPython>=8.0.0