积极的屁孩 committed on
Commit
65b6204
·
1 Parent(s): fbb3473

add links and style ref text for vevo-tts

Browse files
Files changed (1) hide show
  1. app.py +39 -6
app.py CHANGED
@@ -525,7 +525,7 @@ def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
525
  traceback.print_exc()
526
  raise e
527
 
528
- def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language="en"):
529
  temp_ref_path = "wav/temp_ref.wav"
530
  temp_timbre_path = "wav/temp_timbre.wav"
531
  output_path = "wav/output_vevotts.wav"
@@ -560,6 +560,8 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language
560
 
561
  # 打印debug信息
562
  print(f"Reference audio shape: {ref_tensor.shape}, sample rate: {ref_sr}")
 
 
563
 
564
  # 保存上传的音频
565
  torchaudio.save(temp_ref_path, ref_tensor, ref_sr)
@@ -603,7 +605,7 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language
603
  src_text=text,
604
  style_ref_wav_path=temp_ref_path,
605
  timbre_ref_wav_path=temp_timbre_path,
606
- style_ref_wav_text=None,
607
  src_text_language=src_language,
608
  style_ref_wav_text_language=ref_language,
609
  )
@@ -626,9 +628,39 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language
626
  raise e
627
 
628
  # 创建Gradio界面
629
- with gr.Blocks(title="VEVO DEMO") as demo:
630
- gr.Markdown("# VEVO DEMO")
631
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
632
  with gr.Tab("Vevo-Timbre"):
633
  gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
634
  with gr.Row():
@@ -674,13 +706,14 @@ with gr.Blocks(title="VEVO DEMO") as demo:
674
  tts_reference = gr.Audio(label="Style Reference", type="numpy")
675
  tts_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
676
  tts_ref_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Reference Audio Language", value="en")
 
677
  tts_button = gr.Button("Generate")
678
  with gr.Column():
679
  tts_output = gr.Audio(label="Result")
680
 
681
  tts_button.click(
682
  vevo_tts,
683
- inputs=[tts_text, tts_reference, tts_timbre_reference, tts_src_language, tts_ref_language],
684
  outputs=tts_output
685
  )
686
 
 
525
  traceback.print_exc()
526
  raise e
527
 
528
+ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_language="en", ref_language="en"):
529
  temp_ref_path = "wav/temp_ref.wav"
530
  temp_timbre_path = "wav/temp_timbre.wav"
531
  output_path = "wav/output_vevotts.wav"
 
560
 
561
  # 打印debug信息
562
  print(f"Reference audio shape: {ref_tensor.shape}, sample rate: {ref_sr}")
563
+ if style_ref_text:
564
+ print(f"Style reference text: {style_ref_text}")
565
 
566
  # 保存上传的音频
567
  torchaudio.save(temp_ref_path, ref_tensor, ref_sr)
 
605
  src_text=text,
606
  style_ref_wav_path=temp_ref_path,
607
  timbre_ref_wav_path=temp_timbre_path,
608
+ style_ref_wav_text=style_ref_text,
609
  src_text_language=src_language,
610
  style_ref_wav_text_language=ref_language,
611
  )
 
628
  raise e
629
 
630
  # 创建Gradio界面
631
+ with gr.Blocks(title="Vevo DEMO") as demo:
632
+ gr.Markdown("# Vevo DEMO")
633
+ # 添加链接标签行
634
+ with gr.Row(elem_id="links_row"):
635
+ gr.HTML("""
636
+ <div style="display: flex; justify-content: center; gap: 10px; margin-bottom: 20px;">
637
+ <a href="https://arxiv.org/abs/2502.07243" target="_blank" style="text-decoration: none;">
638
+ <div style="background-color: #4a4a4a; color: white; padding: 5px 10px; border-radius: 5px; display: flex; align-items: center;">
639
+ <span style="background-color: #c44e52; padding: 5px 10px; border-radius: 0 5px 5px 0; margin-left: 5px;">Paper</span>
640
+ <span style="padding: 5px 10px;">arXiv</span>
641
+ </div>
642
+ </a>
643
+ <a href="https://openreview.net/pdf?id=anQDiQZhDP" target="_blank" style="text-decoration: none;">
644
+ <div style="background-color: #4a4a4a; color: white; padding: 5px 10px; border-radius: 5px; display: flex; align-items: center;">
645
+ <span style="background-color: #55a868; padding: 5px 10px; border-radius: 0 5px 5px 0; margin-left: 5px;">Paper</span>
646
+ <span style="padding: 5px 10px;">ICLR</span>
647
+ </div>
648
+ </a>
649
+ <a href="https://huggingface.co/amphion/Vevo" target="_blank" style="text-decoration: none;">
650
+ <div style="background-color: #4a4a4a; color: white; padding: 5px 10px; border-radius: 5px; display: flex; align-items: center;">
651
+ <span style="background-color: #eeca3b; padding: 5px 10px; border-radius: 0 5px 5px 0; margin-left: 5px;">Model</span>
652
+ <span style="padding: 5px 10px;">HuggingFace</span>
653
+ </div>
654
+ </a>
655
+ <a href="https://github.com/open-mmlab/Amphion/tree/main/models/vc/vevo" target="_blank" style="text-decoration: none;">
656
+ <div style="background-color: #4a4a4a; color: white; padding: 5px 10px; border-radius: 5px; display: flex; align-items: center;">
657
+ <span style="background-color: #4c72b0; padding: 5px 10px; border-radius: 0 5px 5px 0; margin-left: 5px;">Repo</span>
658
+ <span style="padding: 5px 10px;">GitHub</span>
659
+ </div>
660
+ </a>
661
+ </div>
662
+ """)
663
+
664
  with gr.Tab("Vevo-Timbre"):
665
  gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
666
  with gr.Row():
 
706
  tts_reference = gr.Audio(label="Style Reference", type="numpy")
707
  tts_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
708
  tts_ref_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Reference Audio Language", value="en")
709
+ tts_style_ref_text = gr.Textbox(label="Style Reference Text", placeholder="Enter style reference text...", lines=3)
710
  tts_button = gr.Button("Generate")
711
  with gr.Column():
712
  tts_output = gr.Audio(label="Result")
713
 
714
  tts_button.click(
715
  vevo_tts,
716
+ inputs=[tts_text, tts_reference, tts_timbre_reference, tts_style_ref_text, tts_src_language, tts_ref_language],
717
  outputs=tts_output
718
  )
719