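"""Gradio demo: Vietnamese text-to-speech using a fine-tuned XTTS model (epchannel/EpXTTS)."""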
import os
from datetime import datetime

import torch
import gradio as gr
import soundfile as sf
from vinorm import TTSnorm
from underthesea import sent_tokenize
from unidecode import unidecode
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from huggingface_hub import snapshot_download

# Download the fine-tuned XTTS checkpoint from the Hugging Face Hub on first run.
if not os.path.exists("model/model.pth"):
    snapshot_download(repo_id="epchannel/EpXTTS", repo_type="model", local_dir="model")


def load_model():
    """Build the XTTS model from its config and load the fine-tuned checkpoint."""
    config = XttsConfig()
    config.load_json("model/config.json")
    model = Xtts.init_from_config(config)
    model.load_checkpoint(config, checkpoint_path="model/model.pth", vocab_path="model/vocab.json")
    if torch.cuda.is_available():
        model.cuda()
    return model


def normalize_vietnamese_text(text):
    """Normalize Vietnamese text with vinorm, then tidy punctuation and spell out abbreviations."""
    return (
        TTSnorm(text, unknown=False, lower=False, rule=True)
        .replace("..", ".").replace("!.", "!").replace("?.", "?")
        .replace(" .", ".").replace(" ,", ",").replace('"', "")
        .replace("'", "").replace("AI", "Ây Ai").replace("A.I", "Ây Ai")
        .replace("anh/chị", "anh chị")
    )


def get_file_name(text, max_char=50):
    """Build a filesystem-safe output name from a timestamp and the start of the input text."""
    filename = unidecode(text[:max_char].lower().replace(" ", "_"))
    timestamp = datetime.now().strftime("%m%d%H%M%S")
    return f"{timestamp}_{filename}"


def synthesize(text, voice_choice):
    model = load_model()
    ref_audio = f"model/samples/{voice_choice}.wav"

    # Extract the speaker-conditioning latents and embedding from the reference clip.
    gpt_latent, speaker_embed = model.get_conditioning_latents(
        audio_path=ref_audio,
        gpt_cond_len=model.config.gpt_cond_len,
        max_ref_length=model.config.max_ref_len,
        sound_norm_refs=model.config.sound_norm_refs,
    )

    # Normalization is best effort; fall back to the raw text if it fails.
    try:
        text = normalize_vietnamese_text(text)
    except Exception:
        pass

    # Synthesize sentence by sentence, then concatenate the waveform chunks.
    sentences = sent_tokenize(text)
    wav_chunks = []
    for sent in sentences:
        if sent.strip() == "":
            continue
        wav = model.inference(
            text=sent,
            language="vi",
            gpt_cond_latent=gpt_latent,
            speaker_embedding=speaker_embed,
            temperature=0.5,
            top_k=20,
            top_p=0.85,
            repetition_penalty=5.0,
        )
        wav_chunks.append(torch.tensor(wav["wav"]))

    final_wav = torch.cat(wav_chunks, dim=0).unsqueeze(0)
    filename = f"./output/{get_file_name(text)}.mp3"
    os.makedirs("output", exist_ok=True)
    # XTTS generates 24 kHz audio; MP3 output requires a libsndfile build with MP3 support (1.1.0+).
    sf.write(filename, final_wav.squeeze(0).numpy(), 24000, format='MP3')
    return filename


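# Display name -> reference sample file (model/samples/<name>.wav) for each voice.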
voices = {
    "Bống Xinh": "bongxinh",
    "Nam Calm": "nam-calm",
    "Nam Cham": "nam-cham",
    "Nam Truyền cảm": "nam-truyen-cam",
    "Nữ Lưu Loát": "nu-luu-loat",
    "Nữ Nhẹ Nhàng": "nu-nhe-nhang",
}


with gr.Blocks() as demo:
    gr.Markdown("## 🇻🇳 Text to Speech tiếng Việt (XTTS)")
    with gr.Row():
        text_input = gr.Textbox(label="Nhập văn bản", lines=5, placeholder="Nhập văn bản tiếng Việt...")
        voice_choice = gr.Radio(choices=list(voices.keys()), label="Chọn giọng đọc", value="Bống Xinh")
    btn = gr.Button("🎙️ Chuyển thành giọng nói")
    audio_output = gr.Audio(label="🔊 Kết quả")

    def process(text, voice_label):
        # Map the selected display name to its sample file name before synthesizing.
        file = synthesize(text, voices[voice_label])
        return file

    btn.click(fn=process, inputs=[text_input, voice_choice], outputs=audio_output)

demo.launch()
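# Note: demo.launch(share=True) can expose a temporary public URL (useful on Colab or remote hosts).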