import gradio as gr
import subprocess
import os
import sys
import soundfile as sf
import numpy as np
import torch
import traceback
import spaces

# Clone the model repository (inference code + weights) on first run and make
# its inference module importable.
repo_url = "https://huggingface.co/dangtr0408/StyleTTS2-lite-vi"
repo_dir = "StyleTTS2-lite-vi"
if not os.path.exists(repo_dir):
    subprocess.run(["git", "clone", repo_url, repo_dir], check=True)
sys.path.append(os.path.abspath(repo_dir))

from inference import StyleTTS2

device = "cuda" if torch.cuda.is_available() else "cpu"
config_path = os.path.join(repo_dir, "Models", "config.yaml")
models_path = os.path.join(repo_dir, "Models", "model.pth")
model = StyleTTS2(config_path, models_path).eval().to(device)

# Bundled reference voices and example prompts (Vietnamese; English
# translations in the comments).
voice_path = os.path.join(repo_dir, "reference_audio")
eg_voices = [os.path.join(voice_path, "vn_1.wav"), os.path.join(voice_path, "vn_2.wav")]
eg_texts = [
    # "With only about 90 million parameters, StyleTTS2-lite can easily
    #  generate speech at high speed."
    "Chỉ với khoảng 90 triệu tham số, [en-us]{StyleTTS2-lite} có thể dễ dàng tạo giọng nói với tốc độ cao.",
    # "With StyleTTS2-lite you can use a language tag so the model is sure to
    #  read in English, as well as a speaker tag to switch quickly between voices."
    "[id_1] Với [en-us]{StyleTTS2-lite} bạn có thể sử dụng [en-us]{language tag} để mô hình chắc chắn đọc bằng tiếng Anh, [id_2]cũng như sử dụng [en-us]{speaker tag} để chuyển đổi nhanh giữa các giọng đọc.",
]

# Core inference function
@spaces.GPU
def main(reference_paths, text_prompt, denoise, avg_style, stabilize):
    try:
        # Map each reference clip to a speaker tag ([id_1], [id_2], ...) that
        # the text prompt can reference.
        speakers = {}
        for i, path in enumerate(reference_paths, 1):
            speakers[f"id_{i}"] = {"path": path, "lang": "vi", "speed": 1.0}
        with torch.no_grad():
            styles = model.get_styles(speakers, denoise, avg_style)
            r = model.generate(text_prompt, styles, stabilize, 18, "[id_1]")
        r = r / np.abs(r).max()  # peak-normalize to [-1, 1]
        sf.write("output.wav", r, samplerate=24000)
        return "output.wav", "Audio generated successfully!"
    except Exception:
        return None, traceback.format_exc()

def on_file_upload(file_list):
    if not file_list:
        return None, "No file uploaded yet."
    # Deduplicate by file name; a later upload with the same name wins.
    unique_files = {}
    for file_path in file_list:
        unique_files[os.path.basename(file_path)] = file_path
    uploaded_infos = [f"[id_{i}]: {name}" for i, name in enumerate(unique_files, 1)]
    summary = "\n".join(uploaded_infos)
    return list(unique_files.values()), f"Current reference audios:\n{summary}"

def gen_example(reference_paths, text_prompt):
    output, status = main(reference_paths, text_prompt, 0.6, True, True)
    return output, reference_paths, status
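# Optional smoke test of the inference path (a sketch; assumes the cloned repo
# ships reference_audio/vn_1.wav, as listed in eg_voices above). Uncomment to
# run main() once without the UI:
#
#   wav_path, msg = main([eg_voices[0]], eg_texts[0],
#                        denoise=0.6, avg_style=True, stabilize=True)
#   print(msg, "->", wav_path)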

# Gradio UI
with gr.Blocks() as demo:
    gr.HTML("<h1 style='text-align: center;'>StyleTTS2-Lite Demo</h1>")

    gr.Markdown(
        "Download the local inference package from Hugging Face: "
        "[StyleTTS2-Lite (Vietnamese)]"
        "(https://huggingface.co/dangtr0408/StyleTTS2-lite-vi/)."
    )
    gr.Markdown(
        "Annotate any non-Vietnamese words with the appropriate language tag, "
        "e.g., [en-us]{ } for English. For the full list of language codes, "
        "see the [eSpeak NG docs]"
        "(https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md)."
    )

    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            text_prompt = gr.Textbox(label="Text Prompt", placeholder="Enter your text here...", lines=4)
        with gr.Column(scale=1):
            avg_style = gr.Checkbox(label="Use Average Styles", value=True)
            stabilize = gr.Checkbox(label="Stabilize Speaking Speed", value=True)
            denoise = gr.Slider(0.0, 1.0, step=0.1, value=0.6, label="Denoise Strength")

    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            reference_audios = gr.File(label="Reference Audios", file_types=[".wav", ".mp3"], file_count="multiple", height=150)
            gen_button = gr.Button("Generate")
        with gr.Column(scale=1):
            synthesized_audio = gr.Audio(label="Generated Audio", type="filepath")
            status = gr.Textbox(label="Status", interactive=False, lines=3)

    # Deduplicate the displayed file list and show the speaker-tag assignment
    # whenever uploads change.
    reference_audios.change(
        on_file_upload,
        inputs=[reference_audios],
        outputs=[reference_audios, status]
    )

    gen_button.click(
        fn=main,
        inputs=[reference_audios, text_prompt, denoise, avg_style, stabilize],
        outputs=[synthesized_audio, status]
    )

    gr.Examples(
        examples=[[[eg_voices[0]], eg_texts[0]], [eg_voices, eg_texts[1]]],
        inputs=[reference_audios, text_prompt],
        outputs=[synthesized_audio, reference_audios, status],
        fn=gen_example,
        cache_examples=False,
        label="Examples",
        run_on_click=True
    )

demo.launch()