Spaces:
Build error
Build error
积极的屁孩
committed on
Commit
·
65b6204
1
Parent(s):
fbb3473
add links and style ref text for vevo-tts
Browse files
app.py
CHANGED
@@ -525,7 +525,7 @@ def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
|
|
525 |
traceback.print_exc()
|
526 |
raise e
|
527 |
|
528 |
-
def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language="en"):
|
529 |
temp_ref_path = "wav/temp_ref.wav"
|
530 |
temp_timbre_path = "wav/temp_timbre.wav"
|
531 |
output_path = "wav/output_vevotts.wav"
|
@@ -560,6 +560,8 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language
|
|
560 |
|
561 |
# 打印debug信息
|
562 |
print(f"Reference audio shape: {ref_tensor.shape}, sample rate: {ref_sr}")
|
|
|
|
|
563 |
|
564 |
# 保存上传的音频
|
565 |
torchaudio.save(temp_ref_path, ref_tensor, ref_sr)
|
@@ -603,7 +605,7 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language
|
|
603 |
src_text=text,
|
604 |
style_ref_wav_path=temp_ref_path,
|
605 |
timbre_ref_wav_path=temp_timbre_path,
|
606 |
-
style_ref_wav_text=
|
607 |
src_text_language=src_language,
|
608 |
style_ref_wav_text_language=ref_language,
|
609 |
)
|
@@ -626,9 +628,39 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language
|
|
626 |
raise e
|
627 |
|
628 |
# 创建Gradio界面
|
629 |
-
with gr.Blocks(title="
|
630 |
-
gr.Markdown("#
|
631 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
632 |
with gr.Tab("Vevo-Timbre"):
|
633 |
gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
|
634 |
with gr.Row():
|
@@ -674,13 +706,14 @@ with gr.Blocks(title="VEVO DEMO") as demo:
|
|
674 |
tts_reference = gr.Audio(label="Style Reference", type="numpy")
|
675 |
tts_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
|
676 |
tts_ref_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Reference Audio Language", value="en")
|
|
|
677 |
tts_button = gr.Button("Generate")
|
678 |
with gr.Column():
|
679 |
tts_output = gr.Audio(label="Result")
|
680 |
|
681 |
tts_button.click(
|
682 |
vevo_tts,
|
683 |
-
inputs=[tts_text, tts_reference, tts_timbre_reference, tts_src_language, tts_ref_language],
|
684 |
outputs=tts_output
|
685 |
)
|
686 |
|
|
|
525 |
traceback.print_exc()
|
526 |
raise e
|
527 |
|
528 |
+
def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_language="en", ref_language="en"):
|
529 |
temp_ref_path = "wav/temp_ref.wav"
|
530 |
temp_timbre_path = "wav/temp_timbre.wav"
|
531 |
output_path = "wav/output_vevotts.wav"
|
|
|
560 |
|
561 |
# 打印debug信息
|
562 |
print(f"Reference audio shape: {ref_tensor.shape}, sample rate: {ref_sr}")
|
563 |
+
if style_ref_text:
|
564 |
+
print(f"Style reference text: {style_ref_text}")
|
565 |
|
566 |
# 保存上传的音频
|
567 |
torchaudio.save(temp_ref_path, ref_tensor, ref_sr)
|
|
|
605 |
src_text=text,
|
606 |
style_ref_wav_path=temp_ref_path,
|
607 |
timbre_ref_wav_path=temp_timbre_path,
|
608 |
+
style_ref_wav_text=style_ref_text,
|
609 |
src_text_language=src_language,
|
610 |
style_ref_wav_text_language=ref_language,
|
611 |
)
|
|
|
628 |
raise e
|
629 |
|
630 |
# 创建Gradio界面
|
631 |
+
with gr.Blocks(title="Vevo DEMO") as demo:
|
632 |
+
gr.Markdown("# Vevo DEMO")
|
633 |
+
# 添加链接标签行
|
634 |
+
with gr.Row(elem_id="links_row"):
|
635 |
+
gr.HTML("""
|
636 |
+
<div style="display: flex; justify-content: center; gap: 10px; margin-bottom: 20px;">
|
637 |
+
<a href="https://arxiv.org/abs/2502.07243" target="_blank" style="text-decoration: none;">
|
638 |
+
<div style="background-color: #4a4a4a; color: white; padding: 5px 10px; border-radius: 5px; display: flex; align-items: center;">
|
639 |
+
<span style="background-color: #c44e52; padding: 5px 10px; border-radius: 0 5px 5px 0; margin-left: 5px;">Paper</span>
|
640 |
+
<span style="padding: 5px 10px;">arXiv</span>
|
641 |
+
</div>
|
642 |
+
</a>
|
643 |
+
<a href="https://openreview.net/pdf?id=anQDiQZhDP" target="_blank" style="text-decoration: none;">
|
644 |
+
<div style="background-color: #4a4a4a; color: white; padding: 5px 10px; border-radius: 5px; display: flex; align-items: center;">
|
645 |
+
<span style="background-color: #55a868; padding: 5px 10px; border-radius: 0 5px 5px 0; margin-left: 5px;">Paper</span>
|
646 |
+
<span style="padding: 5px 10px;">ICLR</span>
|
647 |
+
</div>
|
648 |
+
</a>
|
649 |
+
<a href="https://huggingface.co/amphion/Vevo" target="_blank" style="text-decoration: none;">
|
650 |
+
<div style="background-color: #4a4a4a; color: white; padding: 5px 10px; border-radius: 5px; display: flex; align-items: center;">
|
651 |
+
<span style="background-color: #eeca3b; padding: 5px 10px; border-radius: 0 5px 5px 0; margin-left: 5px;">Model</span>
|
652 |
+
<span style="padding: 5px 10px;">HuggingFace</span>
|
653 |
+
</div>
|
654 |
+
</a>
|
655 |
+
<a href="https://github.com/open-mmlab/Amphion/tree/main/models/vc/vevo" target="_blank" style="text-decoration: none;">
|
656 |
+
<div style="background-color: #4a4a4a; color: white; padding: 5px 10px; border-radius: 5px; display: flex; align-items: center;">
|
657 |
+
<span style="background-color: #4c72b0; padding: 5px 10px; border-radius: 0 5px 5px 0; margin-left: 5px;">Repo</span>
|
658 |
+
<span style="padding: 5px 10px;">GitHub</span>
|
659 |
+
</div>
|
660 |
+
</a>
|
661 |
+
</div>
|
662 |
+
""")
|
663 |
+
|
664 |
with gr.Tab("Vevo-Timbre"):
|
665 |
gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
|
666 |
with gr.Row():
|
|
|
706 |
tts_reference = gr.Audio(label="Style Reference", type="numpy")
|
707 |
tts_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
|
708 |
tts_ref_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Reference Audio Language", value="en")
|
709 |
+
tts_style_ref_text = gr.Textbox(label="Style Reference Text", placeholder="Enter style reference text...", lines=3)
|
710 |
tts_button = gr.Button("Generate")
|
711 |
with gr.Column():
|
712 |
tts_output = gr.Audio(label="Result")
|
713 |
|
714 |
tts_button.click(
|
715 |
vevo_tts,
|
716 |
+
inputs=[tts_text, tts_reference, tts_timbre_reference, tts_style_ref_text, tts_src_language, tts_ref_language],
|
717 |
outputs=tts_output
|
718 |
)
|
719 |
|