# EpXTTS / app.py
# Epchannel
# first commit
# 9b20cba
import os
import torch
import gradio as gr
from datetime import datetime
from vinorm import TTSnorm
from underthesea import sent_tokenize
from unidecode import unidecode
import soundfile as sf
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from huggingface_hub import snapshot_download
import os
# Download the model files on first run, i.e. when the checkpoint is not on
# disk yet.
if not os.path.exists("model/model.pth"):
    snapshot_download(
        repo_id="epchannel/EpXTTS",
        repo_type="model",
        local_dir="model",
    )
# Load the XTTS model (cached after the first successful load).
_loaded_model = None


def load_model():
    """Return the XTTS model, loading it from ``model/`` on first use.

    The configuration is read from ``model/config.json`` and the weights and
    vocabulary from ``model/model.pth`` / ``model/vocab.json``. The model is
    moved to CUDA when ``torch.cuda.is_available()`` is true.

    The loaded instance is kept in a module-level variable and reused by
    later calls, so callers that invoke this once per request do not re-read
    the checkpoint from disk every time.

    Returns:
        The initialised ``Xtts`` model instance (the same object on every
        call after the first).
    """
    global _loaded_model
    if _loaded_model is None:
        config = XttsConfig()
        config.load_json("model/config.json")
        model = Xtts.init_from_config(config)
        model.load_checkpoint(
            config,
            checkpoint_path="model/model.pth",
            vocab_path="model/vocab.json",
        )
        if torch.cuda.is_available():
            model.cuda()
        # Publish only after everything above succeeded, so a failed load is
        # retried on the next call instead of caching a half-built model.
        _loaded_model = model
    return _loaded_model
# Normalize Vietnamese text
def normalize_vietnamese_text(text):
    """Normalize ``text`` with ``TTSnorm`` and tidy up the result.

    After ``TTSnorm`` runs, a fixed sequence of literal (substring, not
    word-boundary) substitutions is applied in order: doubled or stray
    punctuation is collapsed, single and double quotes are removed, "AI" and
    "A.I" are spelled out as "Ây Ai", and "anh/chị" becomes "anh chị".

    Returns:
        The normalized string.
    """
    # Order matters: each pair is applied to the output of the previous one.
    substitutions = (
        ("..", "."),
        ("!.", "!"),
        ("?.", "?"),
        (" .", "."),
        (" ,", ","),
        ('"', ""),
        ("'", ""),
        ("AI", "Ây Ai"),
        ("A.I", "Ây Ai"),
        ("anh/chị", "anh chị"),
    )
    normalized = TTSnorm(text, unknown=False, lower=False, rule=True)
    for old, new in substitutions:
        normalized = normalized.replace(old, new)
    return normalized
# Build the output file name
def get_file_name(text, max_char=50):
    """Build a timestamped, filesystem-safe base name from ``text``.

    The first ``max_char`` characters of ``text`` are lower-cased, spaces
    become underscores and the result is transliterated with ``unidecode``.
    Every remaining character that is not alphanumeric, ``_``, ``-`` or ``.``
    is replaced by ``_`` so that input containing path separators, newlines
    or characters such as ``?`` and ``:`` cannot produce an invalid or nested
    path when the name is joined to an output directory.

    Args:
        text: Text to derive the name from.
        max_char: Number of leading characters of ``text`` to use.

    Returns:
        ``"<MMDDHHMMSS>_<slug>"`` with no file extension.
    """
    slug = unidecode(text[:max_char].lower().replace(" ", "_"))
    safe_slug = "".join(
        ch if ch.isalnum() or ch in "_-." else "_" for ch in slug
    )
    timestamp = datetime.now().strftime("%m%d%H%M%S")
    return f"{timestamp}_{safe_slug}"
# Synthesize speech
def synthesize(text, voice_choice):
    """Synthesize Vietnamese speech for ``text`` and save it as an MP3 file.

    The text is normalized (best effort), split into sentences, and each
    non-blank sentence is synthesized separately with the speaker
    conditioning computed from ``model/samples/<voice_choice>.wav``. The
    per-sentence waveforms are concatenated and written under ``./output``.

    Args:
        text: Text to read aloud.
        voice_choice: Base name (without ``.wav``) of the reference sample in
            ``model/samples``.

    Returns:
        Path of the written MP3 file.

    Raises:
        FileNotFoundError: If the reference sample for ``voice_choice`` does
            not exist.
        ValueError: If ``text`` contains no non-blank sentence.
    """
    import logging  # local import keeps this block self-contained

    ref_audio = f"model/samples/{voice_choice}.wav"
    if not os.path.isfile(ref_audio):
        raise FileNotFoundError(f"Reference voice sample not found: {ref_audio}")

    # Normalization is best effort: on failure fall back to the raw text, but
    # record the reason instead of silently swallowing every exception.
    try:
        text = normalize_vietnamese_text(text)
    except Exception:
        logging.getLogger(__name__).warning(
            "Text normalization failed; using the raw text", exc_info=True
        )

    # Validate before the expensive model work so empty input fails fast
    # with a clear message rather than inside torch.cat on an empty list.
    sentences = [sent for sent in sent_tokenize(text) if sent.strip()]
    if not sentences:
        raise ValueError("No text to synthesize")

    model = load_model()

    # Prepare the speaker conditioning from the reference sample.
    gpt_latent, speaker_embed = model.get_conditioning_latents(
        audio_path=ref_audio,
        gpt_cond_len=model.config.gpt_cond_len,
        max_ref_length=model.config.max_ref_len,
        sound_norm_refs=model.config.sound_norm_refs,
    )

    wav_chunks = []
    for sent in sentences:
        result = model.inference(
            text=sent,
            language="vi",
            gpt_cond_latent=gpt_latent,
            speaker_embedding=speaker_embed,
            temperature=0.5,
            top_k=20,
            top_p=0.85,
            repetition_penalty=5.0,
        )
        wav_chunks.append(torch.tensor(result["wav"]))

    # Join the per-sentence waveforms end to end.
    audio = torch.cat(wav_chunks, dim=0).numpy()

    os.makedirs("output", exist_ok=True)
    filename = f"./output/{get_file_name(text)}.mp3"
    # NOTE(review): 24000 Hz is presumably the model's output sample rate -
    # confirm against the model config.
    sf.write(filename, audio, 24000, format='MP3')
    return filename
# Gradio interface
# Maps the voice label shown in the UI to the base name of its reference
# sample; insertion order is the order in which the choices are listed.
voices = {
    "Bống Xinh": "bongxinh",
    "Nam Calm": "nam-calm",
    "Nam Cham": "nam-cham",
    "Nam Truyền cảm": "nam-truyen-cam",
    "Nữ Lưu Loát": "nu-luu-loat",
    "Nữ Nhẹ Nhàng": "nu-nhe-nhang",
    # Add any other voices you have...
}
with gr.Blocks() as demo:
    gr.Markdown("## 🇻🇳 Text to Speech tiếng Việt (XTTS)")
    with gr.Row():
        text_input = gr.Textbox(
            label="Nhập văn bản",
            lines=5,
            placeholder="Nhập văn bản tiếng Việt...",
        )
        voice_choice = gr.Radio(
            choices=list(voices.keys()),
            label="Chọn giọng đọc",
            # Take the default from the mapping itself: the previous literal
            # "Bông Xinh" did not match the key "Bống Xinh", so the default
            # was not one of the offered choices.
            value=next(iter(voices), None),
        )
    btn = gr.Button("🎙️ Chuyển thành giọng nói")
    audio_output = gr.Audio(label="🔊 Kết quả")

    def process(text, voice_label):
        """Validate the form input and return the path of the generated audio.

        Args:
            text: Content of the text box.
            voice_label: Label selected in the voice radio group.

        Returns:
            Path of the audio file produced by ``synthesize``.

        Raises:
            gr.Error: If the text is empty or no known voice is selected, so
                the user sees a readable message instead of a raw traceback.
        """
        if not text or not text.strip():
            raise gr.Error("Vui lòng nhập văn bản.")
        if voice_label not in voices:
            raise gr.Error("Vui lòng chọn giọng đọc.")
        return synthesize(text, voices[voice_label])

    btn.click(fn=process, inputs=[text_input, voice_choice], outputs=audio_output)

demo.launch()