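"""Gradio demo: Vietnamese text-to-speech using a fine-tuned XTTS model (epchannel/EpXTTS)."""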
import os
from datetime import datetime

import torch
import gradio as gr
import soundfile as sf
from vinorm import TTSnorm
from underthesea import sent_tokenize
from unidecode import unidecode
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from huggingface_hub import snapshot_download

# Download the fine-tuned XTTS checkpoint from the Hugging Face Hub on first run.
if not os.path.exists("model/model.pth"):
    snapshot_download(repo_id="epchannel/EpXTTS", repo_type="model", local_dir="model")


def load_model():
    """Build the XTTS model from its config and load the fine-tuned checkpoint."""
    config = XttsConfig()
    config.load_json("model/config.json")
    model = Xtts.init_from_config(config)
    model.load_checkpoint(config, checkpoint_path="model/model.pth", vocab_path="model/vocab.json")
    if torch.cuda.is_available():
        model.cuda()
    return model


def normalize_vietnamese_text(text):
    """Normalize Vietnamese text with vinorm, then tidy punctuation and spell out abbreviations."""
    return (
        TTSnorm(text, unknown=False, lower=False, rule=True)
        .replace("..", ".").replace("!.", "!").replace("?.", "?")
        .replace(" .", ".").replace(" ,", ",").replace('"', "")
        .replace("'", "").replace("AI", "Ây Ai").replace("A.I", "Ây Ai")
        .replace("anh/chị", "anh chị")
    )


def get_file_name(text, max_char=50):
    """Build a filesystem-safe output name from a timestamp and the start of the input text."""
    filename = unidecode(text[:max_char].lower().replace(" ", "_"))
    timestamp = datetime.now().strftime("%m%d%H%M%S")
    return f"{timestamp}_{filename}"


def synthesize(text, voice_choice):
    model = load_model()
    ref_audio = f"model/samples/{voice_choice}.wav"

    # Extract the speaker-conditioning latents and embedding from the reference clip.
    gpt_latent, speaker_embed = model.get_conditioning_latents(
        audio_path=ref_audio,
        gpt_cond_len=model.config.gpt_cond_len,
        max_ref_length=model.config.max_ref_len,
        sound_norm_refs=model.config.sound_norm_refs,
    )

    # Normalization is best effort; fall back to the raw text if it fails.
    try:
        text = normalize_vietnamese_text(text)
    except Exception:
        pass

    # Synthesize sentence by sentence, then concatenate the waveform chunks.
    sentences = sent_tokenize(text)
    wav_chunks = []
    for sent in sentences:
        if sent.strip() == "":
            continue
        wav = model.inference(
            text=sent,
            language="vi",
            gpt_cond_latent=gpt_latent,
            speaker_embedding=speaker_embed,
            temperature=0.5,
            top_k=20,
            top_p=0.85,
            repetition_penalty=5.0,
        )
        wav_chunks.append(torch.tensor(wav["wav"]))

    final_wav = torch.cat(wav_chunks, dim=0).unsqueeze(0)
    filename = f"./output/{get_file_name(text)}.mp3"
    os.makedirs("output", exist_ok=True)
    # XTTS generates 24 kHz audio; MP3 output requires a libsndfile build with MP3 support (1.1.0+).
    sf.write(filename, final_wav.squeeze(0).numpy(), 24000, format='MP3')
    return filename


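# Display name -> reference sample file (model/samples/<name>.wav) for each voice.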
voices = {
    "Bống Xinh": "bongxinh",
    "Nam Calm": "nam-calm",
    "Nam Cham": "nam-cham",
    "Nam Truyền cảm": "nam-truyen-cam",
    "Nữ Lưu Loát": "nu-luu-loat",
    "Nữ Nhẹ Nhàng": "nu-nhe-nhang",
}


with gr.Blocks() as demo:
    gr.Markdown("## 🇻🇳 Text to Speech tiếng Việt (XTTS)")
    with gr.Row():
        text_input = gr.Textbox(label="Nhập văn bản", lines=5, placeholder="Nhập văn bản tiếng Việt...")
        voice_choice = gr.Radio(choices=list(voices.keys()), label="Chọn giọng đọc", value="Bống Xinh")
    btn = gr.Button("🎙️ Chuyển thành giọng nói")
    audio_output = gr.Audio(label="🔊 Kết quả")

    def process(text, voice_label):
        # Map the selected display name to its sample file name before synthesizing.
        file = synthesize(text, voices[voice_label])
        return file

    btn.click(fn=process, inputs=[text_input, voice_choice], outputs=audio_output)

demo.launch()
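# Note: demo.launch(share=True) can expose a temporary public URL (useful on Colab or remote hosts).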