积极的屁孩 committed
Commit cc7434e · 1 Parent(s): 29b1e08

adjustments

Files changed (1): app.py +121 -93
app.py CHANGED
@@ -236,7 +236,7 @@ def vevo_style(content_wav, style_wav):
 
     # Check and process the audio data
     if content_wav is None or style_wav is None:
-        raise ValueError("请上传音频文件")
+        raise ValueError("Please upload audio files")
 
     # Handle the audio format
     if isinstance(content_wav, tuple) and len(content_wav) == 2:
@@ -260,7 +260,7 @@ def vevo_style(content_wav, style_wav):
         # Normalize the volume
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
     else:
-        raise ValueError("内容音频格式不正确")
+        raise ValueError("Invalid content audio format")
 
     if isinstance(style_wav, tuple) and len(style_wav) == 2:
         # Ensure the correct order (data, sample_rate)
@@ -272,11 +272,11 @@ def vevo_style(content_wav, style_wav):
         if style_tensor.ndim == 1:
             style_tensor = style_tensor.unsqueeze(0)  # Add a channel dimension
     else:
-        raise ValueError("风格音频格式不正确")
+        raise ValueError("Invalid style audio format")
 
     # Print debug info
-    print(f"内容音频形状: {content_tensor.shape}, 采样率: {content_sr}")
-    print(f"风格音频形状: {style_tensor.shape}, 采样率: {style_sr}")
+    print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
+    print(f"Style audio shape: {style_tensor.shape}, sample rate: {style_sr}")
 
     # Save the audio
     torchaudio.save(temp_content_path, content_tensor, content_sr)
@@ -296,17 +296,17 @@ def vevo_style(content_wav, style_wav):
 
         # Check the generated audio for numerical anomalies
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
-            print("警告:生成的音频包含NaN或Inf")
+            print("Warning: Generated audio contains NaN or Inf values")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
 
-        print(f"生成音频形状: {gen_audio.shape}, 最大值: {torch.max(gen_audio)}, 最小值: {torch.min(gen_audio)}")
+        print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
 
         # Save the generated audio
         save_audio(gen_audio, output_path=output_path)
 
         return output_path
     except Exception as e:
-        print(f"处理过程中出错: {e}")
+        print(f"Error during processing: {e}")
         import traceback
         traceback.print_exc()
         raise e
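The sanitization step above relies on `torch.nan_to_num`, which replaces NaN and ±Inf entries with the supplied finite values before the audio is written out. A minimal, self-contained illustration (not part of the diff):

```python
import torch

# Build a tensor containing the three problem cases handled in app.py
x = torch.tensor([0.5, float("nan"), float("inf"), float("-inf")])
print(torch.nan_to_num(x, nan=0.0, posinf=0.95, neginf=-0.95))
# tensor([ 0.5000,  0.0000,  0.9500, -0.9500])
```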
@@ -318,7 +318,7 @@ def vevo_timbre(content_wav, reference_wav):
 
     # Check and process the audio data
     if content_wav is None or reference_wav is None:
-        raise ValueError("请上传音频文件")
+        raise ValueError("Please upload audio files")
 
     # Handle the content audio format
     if isinstance(content_wav, tuple) and len(content_wav) == 2:
@@ -342,7 +342,7 @@ def vevo_timbre(content_wav, reference_wav):
         # Normalize the volume
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
     else:
-        raise ValueError("内容音频格式不正确")
+        raise ValueError("Invalid content audio format")
 
     # Handle the reference audio format
     if isinstance(reference_wav, tuple) and len(reference_wav) == 2:
@@ -366,11 +366,11 @@ def vevo_timbre(content_wav, reference_wav):
         # Normalize the volume
         reference_tensor = reference_tensor / (torch.max(torch.abs(reference_tensor)) + 1e-6) * 0.95
     else:
-        raise ValueError("参考音频格式不正确")
+        raise ValueError("Invalid reference audio format")
 
     # Print debug info
-    print(f"内容音频形状: {content_tensor.shape}, 采样率: {content_sr}")
-    print(f"参考音频形状: {reference_tensor.shape}, 采样率: {reference_sr}")
+    print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
+    print(f"Reference audio shape: {reference_tensor.shape}, sample rate: {reference_sr}")
 
     # Save the uploaded audio
     torchaudio.save(temp_content_path, content_tensor, content_sr)
@@ -389,29 +389,30 @@ def vevo_timbre(content_wav, reference_wav):
 
         # Check the generated audio for numerical anomalies
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
-            print("警告:生成的音频包含NaN或Inf")
+            print("Warning: Generated audio contains NaN or Inf values")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
 
-        print(f"生成音频形状: {gen_audio.shape}, 最大值: {torch.max(gen_audio)}, 最小值: {torch.min(gen_audio)}")
+        print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
 
         # Save the generated audio
         save_audio(gen_audio, output_path=output_path)
 
         return output_path
     except Exception as e:
-        print(f"处理过程中出错: {e}")
+        print(f"Error during processing: {e}")
         import traceback
         traceback.print_exc()
         raise e
 
-def vevo_voice(content_wav, reference_wav):
+def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
     temp_content_path = "wav/temp_content.wav"
-    temp_reference_path = "wav/temp_reference.wav"
+    temp_style_path = "wav/temp_style.wav"
+    temp_timbre_path = "wav/temp_timbre.wav"
     output_path = "wav/output_vevovoice.wav"
 
     # Check and process the audio data
-    if content_wav is None or reference_wav is None:
-        raise ValueError("请上传音频文件")
+    if content_wav is None or style_reference_wav is None or timbre_reference_wav is None:
+        raise ValueError("Please upload all required audio files")
 
     # Handle the content audio format
     if isinstance(content_wav, tuple) and len(content_wav) == 2:
@@ -435,39 +436,65 @@ def vevo_voice(content_wav, reference_wav):
         # Normalize the volume
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
     else:
-        raise ValueError("内容音频格式不正确")
+        raise ValueError("Invalid content audio format")
 
-    # Handle the reference audio format
-    if isinstance(reference_wav, tuple) and len(reference_wav) == 2:
-        if isinstance(reference_wav[0], np.ndarray):
-            reference_data, reference_sr = reference_wav
+    # Handle the style reference audio format
+    if isinstance(style_reference_wav, tuple) and len(style_reference_wav) == 2:
+        if isinstance(style_reference_wav[0], np.ndarray):
+            style_data, style_sr = style_reference_wav
         else:
-            reference_sr, reference_data = reference_wav
+            style_sr, style_data = style_reference_wav
 
         # Make sure the audio is mono
-        if len(reference_data.shape) > 1 and reference_data.shape[1] > 1:
-            reference_data = np.mean(reference_data, axis=1)
+        if len(style_data.shape) > 1 and style_data.shape[1] > 1:
+            style_data = np.mean(style_data, axis=1)
 
         # Resample to 24kHz
-        if reference_sr != 24000:
-            reference_tensor = torch.FloatTensor(reference_data).unsqueeze(0)
-            reference_tensor = torchaudio.functional.resample(reference_tensor, reference_sr, 24000)
-            reference_sr = 24000
+        if style_sr != 24000:
+            style_tensor = torch.FloatTensor(style_data).unsqueeze(0)
+            style_tensor = torchaudio.functional.resample(style_tensor, style_sr, 24000)
+            style_sr = 24000
         else:
-            reference_tensor = torch.FloatTensor(reference_data).unsqueeze(0)
+            style_tensor = torch.FloatTensor(style_data).unsqueeze(0)
 
         # Normalize the volume
-        reference_tensor = reference_tensor / (torch.max(torch.abs(reference_tensor)) + 1e-6) * 0.95
+        style_tensor = style_tensor / (torch.max(torch.abs(style_tensor)) + 1e-6) * 0.95
+    else:
+        raise ValueError("Invalid style reference audio format")
+
+    # Handle the timbre reference audio format
+    if isinstance(timbre_reference_wav, tuple) and len(timbre_reference_wav) == 2:
+        if isinstance(timbre_reference_wav[0], np.ndarray):
+            timbre_data, timbre_sr = timbre_reference_wav
+        else:
+            timbre_sr, timbre_data = timbre_reference_wav
+
+        # Make sure the audio is mono
+        if len(timbre_data.shape) > 1 and timbre_data.shape[1] > 1:
+            timbre_data = np.mean(timbre_data, axis=1)
+
+        # Resample to 24kHz
+        if timbre_sr != 24000:
+            timbre_tensor = torch.FloatTensor(timbre_data).unsqueeze(0)
+            timbre_tensor = torchaudio.functional.resample(timbre_tensor, timbre_sr, 24000)
+            timbre_sr = 24000
+        else:
+            timbre_tensor = torch.FloatTensor(timbre_data).unsqueeze(0)
+
+        # Normalize the volume
+        timbre_tensor = timbre_tensor / (torch.max(torch.abs(timbre_tensor)) + 1e-6) * 0.95
     else:
-        raise ValueError("参考音频格式不正确")
+        raise ValueError("Invalid timbre reference audio format")
 
     # Print debug info
-    print(f"内容音频形状: {content_tensor.shape}, 采样率: {content_sr}")
-    print(f"参考音频形状: {reference_tensor.shape}, 采样率: {reference_sr}")
+    print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
+    print(f"Style reference audio shape: {style_tensor.shape}, sample rate: {style_sr}")
+    print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
 
     # Save the uploaded audio
     torchaudio.save(temp_content_path, content_tensor, content_sr)
-    torchaudio.save(temp_reference_path, reference_tensor, reference_sr)
+    torchaudio.save(temp_style_path, style_tensor, style_sr)
+    torchaudio.save(temp_timbre_path, timbre_tensor, timbre_sr)
 
     try:
         # Get the pipeline
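The mono downmix, 24 kHz resample, and peak normalization above are now repeated verbatim for the content, style, and timbre inputs. A possible follow-up refactor is to pull that logic into one shared helper; the sketch below is only a suggestion and not part of this commit. The `_prepare_audio` name and `target_sr` parameter are hypothetical, and it assumes the same Gradio `type="numpy"` tuples the app already accepts:

```python
import numpy as np
import torch
import torchaudio


def _prepare_audio(wav, target_sr=24000):
    """Convert a Gradio audio tuple into a mono, resampled, peak-normalized tensor."""
    if not (isinstance(wav, tuple) and len(wav) == 2):
        raise ValueError("Invalid audio format")

    # Accept either (data, sample_rate) or (sample_rate, data)
    if isinstance(wav[0], np.ndarray):
        data, sr = wav
    else:
        sr, data = wav

    # Downmix multi-channel audio to mono
    if data.ndim > 1 and data.shape[1] > 1:
        data = np.mean(data, axis=1)

    tensor = torch.FloatTensor(data).unsqueeze(0)

    # Resample to the model's expected rate
    if sr != target_sr:
        tensor = torchaudio.functional.resample(tensor, sr, target_sr)
        sr = target_sr

    # Peak-normalize to avoid clipping
    tensor = tensor / (torch.max(torch.abs(tensor)) + 1e-6) * 0.95
    return tensor, sr
```

With such a helper, each branch in `vevo_voice` would reduce to a single call such as `style_tensor, style_sr = _prepare_audio(style_reference_wav)`.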
@@ -477,23 +504,23 @@ def vevo_voice(content_wav, reference_wav):
         gen_audio = pipeline.inference_ar_and_fm(
             src_wav_path=temp_content_path,
             src_text=None,
-            style_ref_wav_path=temp_reference_path,
-            timbre_ref_wav_path=temp_reference_path,
+            style_ref_wav_path=temp_style_path,
+            timbre_ref_wav_path=temp_timbre_path,
         )
 
         # Check the generated audio for numerical anomalies
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
-            print("警告:生成的音频包含NaN或Inf")
+            print("Warning: Generated audio contains NaN or Inf values")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
 
-        print(f"生成音频形状: {gen_audio.shape}, 最大值: {torch.max(gen_audio)}, 最小值: {torch.min(gen_audio)}")
+        print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
 
         # Save the generated audio
         save_audio(gen_audio, output_path=output_path)
 
         return output_path
     except Exception as e:
-        print(f"处理过程中出错: {e}")
+        print(f"Error during processing: {e}")
         import traceback
         traceback.print_exc()
         raise e
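With the new signature, `vevo_voice` takes a content clip plus separate style and timbre references (each as a Gradio-style audio tuple) and writes its result to `wav/output_vevovoice.wav`. A rough smoke test of the updated function outside the UI might look like the following; the input file names are placeholders, and it assumes the code runs inside app.py's module (so `vevo_voice` and its pipeline setup are available) with a `wav/` directory present:

```python
import torchaudio


def load_as_gradio_tuple(path):
    # Mimic Gradio's type="numpy" audio format: (sample_rate, 1-D ndarray)
    waveform, sr = torchaudio.load(path)        # waveform: (channels, frames)
    return sr, waveform.mean(dim=0).numpy()     # downmix to mono


content = load_as_gradio_tuple("wav/source.wav")         # placeholder path
style_ref = load_as_gradio_tuple("wav/style_ref.wav")    # placeholder path
timbre_ref = load_as_gradio_tuple("wav/timbre_ref.wav")  # placeholder path

# Returns the output path, e.g. "wav/output_vevovoice.wav"
print(vevo_voice(content, style_ref, timbre_ref))
```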
@@ -505,7 +532,7 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language
 
     # Check and process the audio data
     if ref_wav is None:
-        raise ValueError("请上传参考音频文件")
+        raise ValueError("Please upload a reference audio file")
 
     # Handle the reference audio format
     if isinstance(ref_wav, tuple) and len(ref_wav) == 2:
@@ -529,10 +556,10 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language
         # Normalize the volume
         ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
     else:
-        raise ValueError("参考音频格式不正确")
+        raise ValueError("Invalid reference audio format")
 
     # Print debug info
-    print(f"参考音频形状: {ref_tensor.shape}, 采样率: {ref_sr}")
+    print(f"Reference audio shape: {ref_tensor.shape}, sample rate: {ref_sr}")
 
     # Save the uploaded audio
     torchaudio.save(temp_ref_path, ref_tensor, ref_sr)
@@ -559,10 +586,10 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language
             # Normalize the volume
             timbre_tensor = timbre_tensor / (torch.max(torch.abs(timbre_tensor)) + 1e-6) * 0.95
 
-            print(f"音色参考音频形状: {timbre_tensor.shape}, 采样率: {timbre_sr}")
+            print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
             torchaudio.save(temp_timbre_path, timbre_tensor, timbre_sr)
         else:
-            raise ValueError("音色参考音频格式不正确")
+            raise ValueError("Invalid timbre reference audio format")
     else:
        temp_timbre_path = temp_ref_path
 
@@ -583,74 +610,75 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language
 
         # Check the generated audio for numerical anomalies
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
-            print("警告:生成的音频包含NaN或Inf")
+            print("Warning: Generated audio contains NaN or Inf values")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
 
-        print(f"生成音频形状: {gen_audio.shape}, 最大值: {torch.max(gen_audio)}, 最小值: {torch.min(gen_audio)}")
+        print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
 
         # Save the generated audio
         save_audio(gen_audio, output_path=output_path)
 
         return output_path
     except Exception as e:
-        print(f"处理过程中出错: {e}")
+        print(f"Error during processing: {e}")
         import traceback
         traceback.print_exc()
         raise e
 
 # Create the Gradio interface
-with gr.Blocks(title="VEVO Demo") as demo:
-    gr.Markdown("# VEVO: 多功能语音合成模型演示")
-    gr.Markdown("## 可控零样本声音模仿与风格转换")
+with gr.Blocks(title="VEVO DEMO") as demo:
+    gr.Markdown("# VEVO DEMO")
+    gr.Markdown("## Controllable Zero-Shot Voice Conversion and Style Transfer")
 
-    with gr.Tab("风格转换 (Style)"):
-        gr.Markdown("### Vevo-Style: 保持音色但转换风格(如口音、情感等)")
+    with gr.Tab("Vevo-Timbre"):
+        gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
         with gr.Row():
             with gr.Column():
-                style_content = gr.Audio(label="内容音频", type="numpy")
-                style_reference = gr.Audio(label="风格音频", type="numpy")
-                style_button = gr.Button("生成")
+                timbre_content = gr.Audio(label="Content Audio", type="numpy")
+                timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
+                timbre_button = gr.Button("Generate")
             with gr.Column():
-                style_output = gr.Audio(label="生成结果")
-        style_button.click(vevo_style, inputs=[style_content, style_reference], outputs=style_output)
+                timbre_output = gr.Audio(label="Result")
+        timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
 
-    with gr.Tab("音色转换 (Timbre)"):
-        gr.Markdown("### Vevo-Timbre: 保持风格但转换音色")
+    with gr.Tab("Vevo-Voice"):
+        gr.Markdown("### Vevo-Voice: Transfer both style and timbre with separate references")
         with gr.Row():
             with gr.Column():
-                timbre_content = gr.Audio(label="内容音频", type="numpy")
-                timbre_reference = gr.Audio(label="音色参考音频", type="numpy")
-                timbre_button = gr.Button("生成")
+                voice_content = gr.Audio(label="Content Audio", type="numpy")
+                voice_style_reference = gr.Audio(label="Style Reference", type="numpy")
+                voice_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
+                voice_button = gr.Button("Generate")
             with gr.Column():
-                timbre_output = gr.Audio(label="生成结果")
-        timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
+                voice_output = gr.Audio(label="Result")
+        voice_button.click(vevo_voice, inputs=[voice_content, voice_style_reference, voice_timbre_reference], outputs=voice_output)
 
-    with gr.Tab("声音转换 (Voice)"):
-        gr.Markdown("### Vevo-Voice: 同时转换风格和音色")
+    with gr.Tab("Vevo-Style"):
+        gr.Markdown("### Vevo-Style: Maintain timbre but transfer style (accent, emotion, etc.)")
         with gr.Row():
             with gr.Column():
-                voice_content = gr.Audio(label="内容音频", type="numpy")
-                voice_reference = gr.Audio(label="声音参考音频", type="numpy")
-                voice_button = gr.Button("生成")
+                style_content = gr.Audio(label="Content Audio", type="numpy")
+                style_reference = gr.Audio(label="Style Reference", type="numpy")
+                style_button = gr.Button("Generate")
             with gr.Column():
-                voice_output = gr.Audio(label="生成结果")
-        voice_button.click(vevo_voice, inputs=[voice_content, voice_reference], outputs=voice_output)
+                style_output = gr.Audio(label="Result")
+        style_button.click(vevo_style, inputs=[style_content, style_reference], outputs=style_output)
 
-    with gr.Tab("文本到语音 (TTS)"):
-        gr.Markdown("### Vevo-TTS: 风格与音色可控的文本到语音转换")
+    with gr.Tab("Vevo-TTS"):
+        gr.Markdown("### Vevo-TTS: Text-to-speech with controllable style and timbre")
         with gr.Row():
             with gr.Column():
-                tts_text = gr.Textbox(label="输入文本", placeholder="请输入要合成的文本...", lines=3)
-                tts_src_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="文本语言", value="en")
-                tts_reference = gr.Audio(label="风格参考音频", type="numpy")
-                tts_ref_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="参考音频语言", value="en")
+                tts_text = gr.Textbox(label="Input Text", placeholder="Enter text to synthesize...", lines=3)
+                tts_src_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Text Language", value="en")
+                tts_reference = gr.Audio(label="Style Reference", type="numpy")
+                tts_ref_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Reference Audio Language", value="en")
 
-                with gr.Accordion("高级选项", open=False):
-                    tts_timbre_reference = gr.Audio(label="音色参考音频(可选)", type="numpy")
+                with gr.Accordion("Advanced Options", open=False):
+                    tts_timbre_reference = gr.Audio(label="Timbre Reference (Optional)", type="numpy")
 
-                tts_button = gr.Button("生成")
+                tts_button = gr.Button("Generate")
             with gr.Column():
-                tts_output = gr.Audio(label="生成结果")
+                tts_output = gr.Audio(label="Result")
 
         tts_button.click(
             vevo_tts,
@@ -659,14 +687,14 @@ with gr.Blocks(title="VEVO Demo") as demo:
         )
 
     gr.Markdown("""
-    ## 关于VEVO
-    VEVO是一个多功能语音合成和转换模型,提供四种主要功能:
-    1. **Vevo-Style**: 保持音色但转换风格(如口音、情感等)
-    2. **Vevo-Timbre**: 保持风格但转换音色
-    3. **Vevo-Voice**: 同时转换风格和音色
-    4. **Vevo-TTS**: 风格与音色可控的文本到语音转换
-
-    更多信息请访问[Amphion项目](https://github.com/open-mmlab/Amphion)
+    ## About VEVO
+    VEVO is a versatile voice synthesis and conversion model that offers four main functionalities:
+    1. **Vevo-Style**: Maintains timbre but transfers style (accent, emotion, etc.)
+    2. **Vevo-Timbre**: Maintains style but transfers timbre
+    3. **Vevo-Voice**: Transfers both style and timbre simultaneously
+    4. **Vevo-TTS**: Text-to-speech with controllable style and timbre
+
+    For more information, visit the [Amphion project](https://github.com/open-mmlab/Amphion)
     """)
 
 # Launch the application
 