import gradio as gr
import subprocess
import os
import sys
import soundfile as sf
import numpy as np
import torch
import traceback
import spaces
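
# `spaces` provides the @spaces.GPU decorator used on Hugging Face ZeroGPU
# Spaces: a GPU is attached only for the duration of a decorated call.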
repo_url = "https://huggingface.co/dangtr0408/StyleTTS2-lite-vi"
repo_dir = "StyleTTS2-lite-vi"
# Clone the model repo on first run so its inference code and weights are available.
if not os.path.exists(repo_dir):
    subprocess.run(["git", "clone", repo_url, repo_dir], check=True)
sys.path.append(os.path.abspath(repo_dir))
from inference import StyleTTS2
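
# Instantiate the model from the config and weights shipped in the repo's
# Models/ directory, and run it in eval mode on GPU when one is available.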
device = 'cuda' if torch.cuda.is_available() else 'cpu'
config_path = os.path.join(repo_dir, "Models", "config.yaml")
models_path = os.path.join(repo_dir, "Models", "model.pth")
model = StyleTTS2(config_path, models_path).eval().to(device)
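
# Bundled reference clips used by the click-to-run examples at the bottom of the UI.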
voice_path = os.path.join(repo_dir, "reference_audio")
eg_voices = [os.path.join(voice_path,"vn_1.wav"), os.path.join(voice_path,"vn_2.wav")]
eg_texts = [
    # English: "With only about 90 million parameters, StyleTTS2-lite can easily generate speech at high speed."
    "Chỉ với khoảng 90 triệu tham số, [en-us]{StyleTTS2-lite} có thể dễ dàng tạo giọng nói với tốc độ cao.",
    # English: "With StyleTTS2-lite you can use a language tag to make the model reliably read in English,
    # as well as a speaker tag to switch quickly between voices."
    "[id_1] Với [en-us]{StyleTTS2-lite} bạn có thể sử dụng [en-us]{language tag} để mô hình chắc chắn đọc bằng tiếng Anh, [id_2]cũng như sử dụng [en-us]{speaker tag} để chuyển đổi nhanh giữa các giọng đọc.",
]
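
# Tag syntax used in the prompts above:
#   [en-us]{...}  marks the enclosed words for eSpeakNG's en-us phonemizer,
#   [id_n]        switches to the style of the n-th uploaded reference clip.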
# Core inference function
@spaces.GPU
def main(reference_paths, text_prompt, denoise, avg_style, stabilize):
    try:
        # Map each uploaded clip to a speaker tag: [id_1], [id_2], ...
        speakers = {}
        for i, path in enumerate(reference_paths, 1):
            speakers[f"id_{i}"] = {
                "path": path,
                "lang": "vi",
                "speed": 1.0,
            }
        with torch.no_grad():
            styles = model.get_styles(speakers, denoise, avg_style)
            # Trailing positional args follow the repo's generate() signature;
            # "[id_1]" is presumably the fallback speaker tag.
            r = model.generate(text_prompt, styles, stabilize, 18, "[id_1]")
        # Peak-normalize and write a 24 kHz wav.
        r = r / np.abs(r).max()
        sf.write("output.wav", r, samplerate=24000)
        return "output.wav", "Audio generated successfully!"
    except Exception:
        return None, traceback.format_exc()
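
# Deduplicate uploads by file name and report which [id_n] tag maps to which clip.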
def on_file_upload(file_list):
    if not file_list:
        return None, "No file uploaded yet."
    unique_files = {}
    for file_path in file_list:
        # Later files with the same name overwrite earlier ones.
        unique_files[os.path.basename(file_path)] = file_path
    summary = "\n".join(
        f"[id_{i}]: {name}" for i, name in enumerate(unique_files, 1)
    )
    return list(unique_files.values()), f"Current reference audios:\n{summary}"
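
# Wrapper used by gr.Examples: runs main() with the default settings and also
# returns the reference paths so the File widget is populated on click.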
def gen_example(reference_paths, text_prompt):
    output, status = main(reference_paths, text_prompt, 0.6, True, True)
    return output, reference_paths, status
# Gradio UI
with gr.Blocks() as demo:
    gr.HTML("<h1 style='text-align: center;'>StyleTTS2-Lite Demo</h1>")
    gr.Markdown(
        "Download the local inference package from Hugging Face: "
        "[StyleTTS2-Lite (Vietnamese)]"
        "(https://huggingface.co/dangtr0408/StyleTTS2-lite-vi/)."
    )
    gr.Markdown(
        "Annotate any non-Vietnamese words with the appropriate language tag, e.g., [en-us]{ } for English. For more information, see the "
        "[eSpeakNG docs]"
        "(https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md)."
    )
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            text_prompt = gr.Textbox(label="Text Prompt", placeholder="Enter your text here...", lines=4)
        with gr.Column(scale=1):
            avg_style = gr.Checkbox(label="Use Average Styles", value=True)
            stabilize = gr.Checkbox(label="Stabilize Speaking Speed", value=True)
            denoise = gr.Slider(0.0, 1.0, step=0.1, value=0.6, label="Denoise Strength")
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            reference_audios = gr.File(label="Reference Audios", file_types=[".wav", ".mp3"], file_count="multiple", height=150)
            gen_button = gr.Button("Generate")
        with gr.Column(scale=1):
            synthesized_audio = gr.Audio(label="Generated Audio", type="filepath")
            status = gr.Textbox(label="Status", interactive=False, lines=3)
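
    # Event wiring: re-deduplicate the file list on every upload, and run
    # inference when the Generate button is clicked.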
    reference_audios.change(
        on_file_upload,
        inputs=[reference_audios],
        outputs=[reference_audios, status],
    )
    gen_button.click(
        fn=main,
        inputs=[reference_audios, text_prompt, denoise, avg_style, stabilize],
        outputs=[synthesized_audio, status],
    )
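
    # Click-to-run examples: the first uses a single reference voice, the
    # second demonstrates multi-speaker [id_n] switching with two voices.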
    gr.Examples(
        examples=[[[eg_voices[0]], eg_texts[0]], [eg_voices, eg_texts[1]]],
        inputs=[reference_audios, text_prompt],
        outputs=[synthesized_audio, reference_audios, status],
        fn=gen_example,
        cache_examples=False,
        label="Examples",
        run_on_click=True,
    )

demo.launch()
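
# Hypothetical programmatic usage (bypasses the UI; names above are assumed in scope):
#   wav_path, msg = main([eg_voices[0]], eg_texts[0], denoise=0.6, avg_style=True, stabilize=True)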