import sys
sys.path.append('.')

import os
import gc
import platform
import shutil

import torch
import gradio as gr

from KOKORO.models import build_model
from KOKORO.utils import tts, tts_file_name

# Download model weights if they are not already present
os.system("python download_model.py")

# Initialize model
print("Loading model...")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')
MODEL = build_model('./KOKORO/kokoro-v0_19.pth', device)
print("Model loaded successfully.")

# Get list of available voices
voice_list = [
    os.path.splitext(filename)[0]
    for filename in os.listdir("./KOKORO/voices")
    if filename.endswith('.pt')
]
voice_list = sorted(voice_list, key=len)

model_list = ["kokoro-v0_19.pth", "kokoro-v0_19-half.pth"]
current_model = model_list[0]


def update_model(model_name):
    """Updates the TTS model only if the specified model is not already loaded."""
    global MODEL, current_model
    if current_model == model_name:
        return f"Model already set to {model_name}"

    model_path = f"./KOKORO/{model_name}"
    if model_name == "kokoro-v0_19-half.pth":
        model_path = f"./KOKORO/fp16/{model_name}"

    # Free the currently loaded model before building the new one
    del MODEL
    gc.collect()
    torch.cuda.empty_cache()

    MODEL = build_model(model_path, device)
    current_model = model_name
    return f"Model updated to {model_name}"


def manage_files(file_path):
    """Validates uploaded voicepack files."""
    if os.path.exists(file_path):
        file_extension = os.path.splitext(file_path)[1]
        file_size = os.path.getsize(file_path)
        # Accept only .pt files up to 5 MB; delete anything else
        if file_extension == ".pt" and file_size <= 5 * 1024 * 1024:
            return True
        os.remove(file_path)
        return False
    return False


def text_to_speech(text, model_name="kokoro-v0_19.pth", voice_name="af", speed=1.0,
                   pad_between_segments=0, remove_silence=True, minimum_silence=0.20,
                   custom_voicepack=None, trim=0.0):
    """Converts text to speech using specified parameters."""
    update_model(model_name)

    if not minimum_silence:
        minimum_silence = 0.05

    save_at = tts_file_name(text)

    if custom_voicepack:
        if manage_files(custom_voicepack):
            voice_name = custom_voicepack
        else:
            gr.Warning("Invalid voicepack file. Using default voice instead.")

    audio_path = tts(MODEL, device, text, voice_name, speed=speed, trim=trim,
                     pad_between_segments=pad_between_segments, output_file=save_at,
                     remove_silence=remove_silence, minimum_silence=minimum_silence)
    return audio_path


def toggle_autoplay(autoplay):
    return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)


# Main Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Kokoro TTS - Batched Text-to-Speech")
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(label='Enter Text', lines=3,
                              placeholder="Type your text here...")
            with gr.Row():
                voice = gr.Dropdown(voice_list, value='af_bella',
                                    allow_custom_value=False, label='Voice',
                                    info='Select a voice')
            with gr.Row():
                generate_btn = gr.Button('Generate', variant='primary')
            with gr.Accordion('Audio Settings', open=False):
                model_name = gr.Dropdown(model_list, label="Model", value=model_list[0])
                speed = gr.Slider(minimum=0.25, maximum=2, value=1, step=0.1,
                                  label='Speed', info='Adjust speaking speed')
                remove_silence = gr.Checkbox(value=False, label='Remove Silence')
                minimum_silence = gr.Number(label="Minimum Silence (seconds)", value=0.05)
                pad_between = gr.Slider(minimum=0, maximum=2, value=0, step=0.1,
                                        label='Pad Between',
                                        info='Silent duration between segments')
                custom_voicepack = gr.File(label='Upload Custom VoicePack (.pt file)')
        with gr.Column():
            audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
            with gr.Accordion('Autoplay Settings', open=False):
                autoplay = gr.Checkbox(value=True, label='Autoplay')
                autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])

    text.submit(
        text_to_speech,
        inputs=[text, model_name, voice, speed, pad_between, remove_silence,
                minimum_silence, custom_voicepack],
        outputs=[audio],
    )
    generate_btn.click(
        text_to_speech,
        inputs=[text, model_name, voice, speed, pad_between, remove_silence,
                minimum_silence, custom_voicepack],
        outputs=[audio],
    )

if __name__ == "__main__":
    demo.queue().launch()