import sys
sys.path.append('.')
import os

# Download the model weights before anything tries to load them
os.system("python download_model.py")

import gc
import platform
import shutil

import torch
import gradio as gr

from KOKORO.models import build_model
from KOKORO.utils import tts, tts_file_name

# Initialize model
print("Loading model...")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')
MODEL = build_model('./KOKORO/kokoro-v0_19.pth', device)
print("Model loaded successfully.")
# Get list of available voices
voice_list = [
    os.path.splitext(filename)[0]
    for filename in os.listdir("./KOKORO/voices")
    if filename.endswith('.pt')
]
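# Sort by name length so the short base voices (e.g. 'af') appear first in the dropdown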
voice_list = sorted(voice_list, key=len)
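# Two checkpoints are available: full precision, and a half-precision (fp16) copy stored under ./KOKORO/fp16/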
model_list = ["kokoro-v0_19.pth", "kokoro-v0_19-half.pth"]
current_model = model_list[0]


def update_model(model_name):
    """Updates the TTS model only if the specified model is not already loaded."""
    global MODEL, current_model
    if current_model == model_name:
        return f"Model already set to {model_name}"
    model_path = f"./KOKORO/{model_name}"
    if model_name == "kokoro-v0_19-half.pth":
        model_path = f"./KOKORO/fp16/{model_name}"
    # Release the current model and reclaim memory before loading the new weights
    del MODEL
    gc.collect()
    if device == 'cuda':
        torch.cuda.empty_cache()
    MODEL = build_model(model_path, device)
    current_model = model_name
    return f"Model updated to {model_name}"


def manage_files(file_path):
    """Validates uploaded voicepack files: must be a .pt file no larger than 5 MB."""
    if os.path.exists(file_path):
        file_extension = os.path.splitext(file_path)[1]
        file_size = os.path.getsize(file_path)
        # Accept only .pt files up to 5 MB; anything else is deleted
        if file_extension == ".pt" and file_size <= 5 * 1024 * 1024:
            return True
        else:
            os.remove(file_path)
            return False
    return False


def text_to_speech(text, model_name="kokoro-v0_19.pth", voice_name="af", speed=1.0,
                   pad_between_segments=0, remove_silence=True, minimum_silence=0.20,
                   custom_voicepack=None, trim=0.0):
    """Converts text to speech using the specified parameters."""
    update_model(model_name)
    if not minimum_silence:
        minimum_silence = 0.05
    save_at = tts_file_name(text)
    if custom_voicepack:
        # Use the uploaded voicepack only if it passes validation; otherwise fall back to the selected voice
        if manage_files(custom_voicepack):
            voice_name = custom_voicepack
        else:
            gr.Warning("Invalid voicepack file. Using default voice instead.")
    audio_path = tts(MODEL, device, text, voice_name, speed=speed, trim=trim,
                     pad_between_segments=pad_between_segments, output_file=save_at,
                     remove_silence=remove_silence, minimum_silence=minimum_silence)
    return audio_path


def toggle_autoplay(autoplay):
    return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)


# Main Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Kokoro TTS - Batched Text-to-Speech")
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(label='Enter Text', lines=3, placeholder="Type your text here...")
            with gr.Row():
                voice = gr.Dropdown(voice_list, value='af_bella', allow_custom_value=False,
                                    label='Voice', info='Select a voice')
            with gr.Row():
                generate_btn = gr.Button('Generate', variant='primary')
            with gr.Accordion('Audio Settings', open=False):
                model_name = gr.Dropdown(model_list, label="Model", value=model_list[0])
                speed = gr.Slider(minimum=0.25, maximum=2, value=1, step=0.1,
                                  label='Speed', info='Adjust speaking speed')
                remove_silence = gr.Checkbox(value=False, label='Remove Silence')
                minimum_silence = gr.Number(label="Minimum Silence (seconds)", value=0.05)
                pad_between = gr.Slider(minimum=0, maximum=2, value=0, step=0.1,
                                        label='Pad Between', info='Silent duration between segments')
                custom_voicepack = gr.File(label='Upload Custom VoicePack (.pt file)')
        with gr.Column():
            audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
            with gr.Accordion('Autoplay Settings', open=False):
                autoplay = gr.Checkbox(value=True, label='Autoplay')
                autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])

    # Both pressing Enter in the textbox and clicking Generate run the same TTS handler
    text.submit(text_to_speech,
                inputs=[text, model_name, voice, speed, pad_between, remove_silence,
                        minimum_silence, custom_voicepack],
                outputs=[audio])
    generate_btn.click(text_to_speech,
                       inputs=[text, model_name, voice, speed, pad_between, remove_silence,
                               minimum_silence, custom_voicepack],
                       outputs=[audio])

if __name__ == "__main__":
    demo.queue().launch()