import gradio as gr
import subprocess
import os
import sys
import soundfile as sf
import numpy as np
import torch
import traceback
import spaces

# Clone the model repository (inference code + weights) on first run and make
# its inference module importable.
repo_url = "https://huggingface.co/dangtr0408/StyleTTS2-lite-vi"
repo_dir = "StyleTTS2-lite-vi"
if not os.path.exists(repo_dir):
    subprocess.run(["git", "clone", repo_url, repo_dir], check=True)
sys.path.append(os.path.abspath(repo_dir))

from inference import StyleTTS2

device = "cuda" if torch.cuda.is_available() else "cpu"
config_path = os.path.join(repo_dir, "Models", "config.yaml")
models_path = os.path.join(repo_dir, "Models", "model.pth")
model = StyleTTS2(config_path, models_path).eval().to(device)

# Bundled reference voices and example prompts (Vietnamese; English
# translations in the comments).
voice_path = os.path.join(repo_dir, "reference_audio")
eg_voices = [os.path.join(voice_path, "vn_1.wav"), os.path.join(voice_path, "vn_2.wav")]
eg_texts = [
    # "With only about 90 million parameters, StyleTTS2-lite can easily
    #  generate speech at high speed."
    "Chỉ với khoảng 90 triệu tham số, [en-us]{StyleTTS2-lite} có thể dễ dàng tạo giọng nói với tốc độ cao.",
    # "With StyleTTS2-lite you can use a language tag so the model is sure to
    #  read in English, as well as a speaker tag to switch quickly between voices."
    "[id_1] Với [en-us]{StyleTTS2-lite} bạn có thể sử dụng [en-us]{language tag} để mô hình chắc chắn đọc bằng tiếng Anh, [id_2]cũng như sử dụng [en-us]{speaker tag} để chuyển đổi nhanh giữa các giọng đọc.",
]

# Core inference function
@spaces.GPU
def main(reference_paths, text_prompt, denoise, avg_style, stabilize):
    try:
        # Map each reference clip to a speaker tag ([id_1], [id_2], ...) that
        # the text prompt can reference.
        speakers = {}
        for i, path in enumerate(reference_paths, 1):
            speakers[f"id_{i}"] = {"path": path, "lang": "vi", "speed": 1.0}
        with torch.no_grad():
            styles = model.get_styles(speakers, denoise, avg_style)
            r = model.generate(text_prompt, styles, stabilize, 18, "[id_1]")
        r = r / np.abs(r).max()  # peak-normalize to [-1, 1]
        sf.write("output.wav", r, samplerate=24000)
        return "output.wav", "Audio generated successfully!"
    except Exception:
        return None, traceback.format_exc()

def on_file_upload(file_list):
    if not file_list:
        return None, "No file uploaded yet."
    # Deduplicate by file name; a later upload with the same name wins.
    unique_files = {}
    for file_path in file_list:
        unique_files[os.path.basename(file_path)] = file_path
    uploaded_infos = [f"[id_{i}]: {name}" for i, name in enumerate(unique_files, 1)]
    summary = "\n".join(uploaded_infos)
    return list(unique_files.values()), f"Current reference audios:\n{summary}"

def gen_example(reference_paths, text_prompt):
    output, status = main(reference_paths, text_prompt, 0.6, True, True)
    return output, reference_paths, status
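# Optional smoke test of the inference path (a sketch; assumes the cloned repo
# ships reference_audio/vn_1.wav, as listed in eg_voices above). Uncomment to
# run main() once without the UI:
#
#   wav_path, msg = main([eg_voices[0]], eg_texts[0],
#                        denoise=0.6, avg_style=True, stabilize=True)
#   print(msg, "->", wav_path)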

# Gradio UI
with gr.Blocks() as demo:
    gr.HTML("<h1 style='text-align: center;'>StyleTTS2-Lite Demo</h1>")

    gr.Markdown(
        "Download the local inference package from Hugging Face: "
        "[StyleTTS2-Lite (Vietnamese)]"
        "(https://huggingface.co/dangtr0408/StyleTTS2-lite-vi/)."
    )
    gr.Markdown(
        "Annotate any non-Vietnamese words with the appropriate language tag, "
        "e.g., [en-us]{ } for English. For the full list of language codes, "
        "see the [eSpeak NG docs]"
        "(https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md)."
    )

    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            text_prompt = gr.Textbox(label="Text Prompt", placeholder="Enter your text here...", lines=4)
        with gr.Column(scale=1):
            avg_style = gr.Checkbox(label="Use Average Styles", value=True)
            stabilize = gr.Checkbox(label="Stabilize Speaking Speed", value=True)
            denoise = gr.Slider(0.0, 1.0, step=0.1, value=0.6, label="Denoise Strength")

    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            reference_audios = gr.File(label="Reference Audios", file_types=[".wav", ".mp3"], file_count="multiple", height=150)
            gen_button = gr.Button("Generate")
        with gr.Column(scale=1):
            synthesized_audio = gr.Audio(label="Generated Audio", type="filepath")
            status = gr.Textbox(label="Status", interactive=False, lines=3)

    # Deduplicate the displayed file list and show the speaker-tag assignment
    # whenever uploads change.
    reference_audios.change(
        on_file_upload,
        inputs=[reference_audios],
        outputs=[reference_audios, status]
    )

    gen_button.click(
        fn=main,
        inputs=[reference_audios, text_prompt, denoise, avg_style, stabilize],
        outputs=[synthesized_audio, status]
    )

    gr.Examples(
        examples=[[[eg_voices[0]], eg_texts[0]], [eg_voices, eg_texts[1]]],
        inputs=[reference_audios, text_prompt],
        outputs=[synthesized_audio, reference_audios, status],
        fn=gen_example,
        cache_examples=False,
        label="Examples",
        run_on_click=True
    )

demo.launch()