import sys
sys.path.append('.')

import os
import gc

import torch
import gradio as gr

from KOKORO.models import build_model
from KOKORO.utils import tts, tts_file_name

# Fetch the model weights if they are not already present locally
os.system("python download_model.py")

# Initialize model
print("Loading model...")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')
MODEL = build_model('./KOKORO/kokoro-v0_19.pth', device)
print("Model loaded successfully.")

# Get list of available voices
voice_list = [
    os.path.splitext(filename)[0]
    for filename in os.listdir("./KOKORO/voices")
    if filename.endswith('.pt')
]
# Sort by name length so the short base voices (e.g. "af") come before their variants
voice_list = sorted(voice_list, key=len)

# Available checkpoints: full precision and an fp16 variant
model_list = ["kokoro-v0_19.pth", "kokoro-v0_19-half.pth"]
current_model = model_list[0]

def update_model(model_name):
    """Updates the TTS model only if the specified model is not already loaded."""
    global MODEL, current_model
    if current_model == model_name:
        return f"Model already set to {model_name}"
    model_path = f"./KOKORO/{model_name}"
    # The half-precision checkpoint is stored in the fp16/ subdirectory
    if model_name == "kokoro-v0_19-half.pth":
        model_path = f"./KOKORO/fp16/{model_name}"
    # Release the currently loaded model before building the new one
    del MODEL
    gc.collect()
    torch.cuda.empty_cache()
    MODEL = build_model(model_path, device)
    current_model = model_name
    return f"Model updated to {model_name}"

def manage_files(file_path):
    """Validates uploaded voicepack files."""
    if os.path.exists(file_path):
        file_extension = os.path.splitext(file_path)[1]
        file_size = os.path.getsize(file_path)
        # Accept only .pt voicepack files no larger than 5 MB
        if file_extension == ".pt" and file_size <= 5 * 1024 * 1024:
            return True
        else:
            os.remove(file_path)
            return False
    return False

def text_to_speech(text, model_name="kokoro-v0_19.pth", voice_name="af", speed=1.0, 
                   pad_between_segments=0, remove_silence=True, minimum_silence=0.20,
                   custom_voicepack=None, trim=0.0):
    """Converts text to speech using specified parameters."""
    update_model(model_name)
    if not minimum_silence:
        minimum_silence = 0.05
    save_at = tts_file_name(text)
    
    if custom_voicepack:
        if manage_files(custom_voicepack):
            voice_name = custom_voicepack
        else:
            gr.Warning("Invalid voicepack file. Using default voice instead.")
    
    audio_path = tts(MODEL, device, text, voice_name, speed=speed, trim=trim,
                    pad_between_segments=pad_between_segments, output_file=save_at,
                    remove_silence=remove_silence, minimum_silence=minimum_silence)
    return audio_path
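
# Example (illustrative only): calling the pipeline directly instead of through the UI.
# Assumes the default checkpoint and the bundled "af_bella" voicepack are available.
#   wav_path = text_to_speech("Hello from Kokoro!", voice_name="af_bella", speed=1.1)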

def toggle_autoplay(autoplay):
    """Rebuilds the output audio component with the selected autoplay setting."""
    return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)

# Main Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Kokoro TTS - Batched Text-to-Speech")
    
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(label='Enter Text', lines=3, placeholder="Type your text here...")
            with gr.Row():
                voice = gr.Dropdown(voice_list, value='af_bella', allow_custom_value=False, 
                                  label='Voice', info='Select a voice')
            with gr.Row():
                generate_btn = gr.Button('Generate', variant='primary')
            
            with gr.Accordion('Audio Settings', open=False):
                model_name = gr.Dropdown(model_list, label="Model", value=model_list[0])
                speed = gr.Slider(minimum=0.25, maximum=2, value=1, step=0.1, 
                                label='Speed', info='Adjust speaking speed')
                remove_silence = gr.Checkbox(value=False, label='Remove Silence')
                minimum_silence = gr.Number(label="Minimum Silence (seconds)", value=0.05)
                pad_between = gr.Slider(minimum=0, maximum=2, value=0, step=0.1,
                                      label='Pad Between', info='Silent duration between segments')
                custom_voicepack = gr.File(label='Upload Custom VoicePack (.pt file)')
                
        with gr.Column():
            audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
            with gr.Accordion('Autoplay Settings', open=False):
                autoplay = gr.Checkbox(value=True, label='Autoplay')
                autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])

    text.submit(text_to_speech, 
               inputs=[text, model_name, voice, speed, pad_between, remove_silence, 
                      minimum_silence, custom_voicepack], 
               outputs=[audio])
    
    generate_btn.click(text_to_speech,
                      inputs=[text, model_name, voice, speed, pad_between, remove_silence,
                             minimum_silence, custom_voicepack],
                      outputs=[audio])

if __name__ == "__main__":
    demo.queue().launch()
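    # To expose the app publicly (e.g. when running on a remote machine), Gradio's
    # share option could be used instead: demo.queue().launch(share=True)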