import sys
import os

# Make the local KOKORO package importable and fetch the model weights
# before anything tries to load them.
sys.path.append('.')
os.system("python download_model.py")

import gc
import platform
import shutil

import torch
import gradio as gr

from KOKORO.models import build_model
from KOKORO.utils import tts, tts_file_name

print("Loading model...")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')
MODEL = build_model('./KOKORO/kokoro-v0_19.pth', device)
print("Model loaded successfully.")

# Available voices are the .pt voicepacks shipped in ./KOKORO/voices;
# sorting by name length keeps the short base voices (e.g. "af") at the top.
voice_list = [
    os.path.splitext(filename)[0]
    for filename in os.listdir("./KOKORO/voices")
    if filename.endswith('.pt')
]
voice_list = sorted(voice_list, key=len)

model_list = ["kokoro-v0_19.pth", "kokoro-v0_19-half.pth"]
current_model = model_list[0]
def update_model(model_name):
    """Updates the TTS model only if the specified model is not already loaded."""
    global MODEL, current_model
    if current_model == model_name:
        return f"Model already set to {model_name}"
    model_path = f"./KOKORO/{model_name}"
    if model_name == "kokoro-v0_19-half.pth":
        model_path = f"./KOKORO/fp16/{model_name}"
    del MODEL
    gc.collect()
    torch.cuda.empty_cache()
    MODEL = build_model(model_path, device)
    current_model = model_name
    return f"Model updated to {model_name}"
def manage_files(file_path):
    """Validates uploaded voicepack files."""
    if os.path.exists(file_path):
        file_extension = os.path.splitext(file_path)[1]
        file_size = os.path.getsize(file_path)
        # Accept only .pt voicepacks no larger than 5 MB; delete anything else.
        if file_extension == ".pt" and file_size <= 5 * 1024 * 1024:
            return True
        else:
            os.remove(file_path)
            return False
    return False


def text_to_speech(text, model_name="kokoro-v0_19.pth", voice_name="af", speed=1.0,
                   pad_between_segments=0, remove_silence=True, minimum_silence=0.20,
                   custom_voicepack=None, trim=0.0):
    """Converts text to speech using specified parameters."""
    update_model(model_name)
    if not minimum_silence:
        minimum_silence = 0.05
    save_at = tts_file_name(text)

    # A validated custom voicepack is passed to tts() in place of the voice name.
    if custom_voicepack:
        if manage_files(custom_voicepack):
            voice_name = custom_voicepack
        else:
            gr.Warning("Invalid voicepack file. Using default voice instead.")

    audio_path = tts(MODEL, device, text, voice_name, speed=speed, trim=trim,
                     pad_between_segments=pad_between_segments, output_file=save_at,
                     remove_silence=remove_silence, minimum_silence=minimum_silence)
    return audio_path
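
# Illustrative usage (assumption: run from a Python shell rather than the UI);
# all names come from this file, and the return value is the path of the
# generated audio file.
# clip_path = text_to_speech("Hello from Kokoro!", voice_name="af_bella", speed=1.0)
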
def toggle_autoplay(autoplay):
    return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)


with gr.Blocks() as demo:
    gr.Markdown("# Kokoro TTS - Batched Text-to-Speech")

    with gr.Row():
        with gr.Column():
            text = gr.Textbox(label='Enter Text', lines=3, placeholder="Type your text here...")
            with gr.Row():
                voice = gr.Dropdown(voice_list, value='af_bella', allow_custom_value=False,
                                    label='Voice', info='Select a voice')
            with gr.Row():
                generate_btn = gr.Button('Generate', variant='primary')

            with gr.Accordion('Audio Settings', open=False):
                model_name = gr.Dropdown(model_list, label="Model", value=model_list[0])
                speed = gr.Slider(minimum=0.25, maximum=2, value=1, step=0.1,
                                  label='Speed', info='Adjust speaking speed')
                remove_silence = gr.Checkbox(value=False, label='Remove Silence')
                minimum_silence = gr.Number(label="Minimum Silence (seconds)", value=0.05)
                pad_between = gr.Slider(minimum=0, maximum=2, value=0, step=0.1,
                                        label='Pad Between', info='Silent duration between segments')
                custom_voicepack = gr.File(label='Upload Custom VoicePack (.pt file)')

        with gr.Column():
            audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
            with gr.Accordion('Autoplay Settings', open=False):
                autoplay = gr.Checkbox(value=True, label='Autoplay')
                autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])

    text.submit(text_to_speech,
                inputs=[text, model_name, voice, speed, pad_between, remove_silence,
                        minimum_silence, custom_voicepack],
                outputs=[audio])

    generate_btn.click(text_to_speech,
                       inputs=[text, model_name, voice, speed, pad_between, remove_silence,
                               minimum_silence, custom_voicepack],
                       outputs=[audio])

if __name__ == "__main__":
    demo.queue().launch()
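    # Optional (assumption: the defaults are fine for local use): launching with
    # demo.queue().launch(share=True) instead creates a temporary public Gradio link.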