hivecorp committed on
Commit
bff0a67
·
verified ·
1 Parent(s): 2b20da1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -959
app.py CHANGED
@@ -1,6 +1,5 @@
1
-
2
  from KOKORO.models import build_model
3
- from KOKORO.utils import tts,tts_file_name,podcast
4
  import sys
5
  sys.path.append('.')
6
  import os
@@ -8,1014 +7,115 @@ os.system("python download_model.py")
8
  import torch
9
  import gc
10
  import platform
 
11
  import shutil
12
base_path = os.getcwd()

def clean_folder_before_start():
    """Delete and recreate the scratch folders so each run starts clean.

    Removes ./dummy and ./TTS_DUB (when present) under the current working
    directory, then recreates them empty. Removal failures (e.g. a file
    held open by another process) are ignored so startup never aborts on
    leftover state.
    """
    global base_path
    # folder_list=["dummy","TTS_DUB","kokoro_audio"]
    folder_list = ["dummy", "TTS_DUB"]  # ,"kokoro_audio"]
    for folder in folder_list:
        folder_path = os.path.join(base_path, folder)
        if os.path.exists(folder_path):
            try:
                shutil.rmtree(folder_path)
            except OSError:
                # Best-effort cleanup only; a locked or permission-denied
                # folder must not crash application startup.
                pass
        os.makedirs(folder_path, exist_ok=True)

clean_folder_before_start()
25
 
 
26
  print("Loading model...")
27
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
28
  print(f'Using device: {device}')
29
  MODEL = build_model('./KOKORO/kokoro-v0_19.pth', device)
30
  print("Model loaded successfully.")
31
 
32
def tts_maker(text, voice_name="af_bella", speed=0.8, trim=0, pad_between=0, save_path="temp.wav", remove_silence=False, minimum_silence=50):
    """Run the loaded Kokoro model over *text* and return the audio path."""
    global MODEL
    # Stray newlines in the target filename would break the writer downstream.
    for bad_char in ('\n', '\r'):
        save_path = save_path.replace(bad_char, '')
    return tts(
        MODEL,
        device,
        text,
        voice_name,
        speed=speed,
        trim=trim,
        pad_between_segments=pad_between,
        output_file=save_path,
        remove_silence=remove_silence,
        minimum_silence=minimum_silence,
    )
38
-
39
 
40
# Available checkpoints: full precision and the fp16 variant (./KOKORO/fp16).
model_list = ["kokoro-v0_19.pth", "kokoro-v0_19-half.pth"]
# Name of the checkpoint currently held in MODEL; lets update_model() skip reloads.
current_model = model_list[0]
42
 
43
def update_model(model_name):
    """Swap the global Kokoro MODEL to *model_name*, reloading only when needed.

    Args:
        model_name: One of the entries in ``model_list``.

    Returns:
        A human-readable status string describing what happened.
    """
    global MODEL, current_model
    if current_model == model_name:
        return f"Model already set to {model_name}"  # No need to reload
    model_path = f"./KOKORO/{model_name}"  # Default model path
    if model_name == "kokoro-v0_19-half.pth":
        model_path = f"./KOKORO/fp16/{model_name}"  # fp16 build lives in a subfolder
    # Drop the reference (instead of `del MODEL`) so the global stays bound
    # even if build_model() below raises, then reclaim memory eagerly.
    MODEL = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # Only meaningful when running on GPU
    MODEL = build_model(model_path, device)
    current_model = model_name
    return f"Model updated to {model_name}"
60
 
61
-
62
def manage_files(file_path):
    """Validate an uploaded custom voicepack file.

    Keeps the file and returns True only when it is a ``.pt`` file of at
    most 5 MB; anything else is deleted (best effort) and False is
    returned. A missing or falsy path also returns False.
    """
    if not file_path or not os.path.exists(file_path):
        return False  # Nothing to validate
    file_extension = os.path.splitext(file_path)[1].lower()  # accept .PT too
    file_size = os.path.getsize(file_path)  # size in bytes
    if file_extension == ".pt" and file_size <= 5 * 1024 * 1024:
        return True  # Valid voicepack: keep it
    try:
        os.remove(file_path)  # Drop invalid or oversized uploads
    except OSError:
        pass  # Cleanup failure must not crash the request
    return False
73
 
74
-
75
-
76
def text_to_speech(text, model_name="kokoro-v0_19.pth", voice_name="af", speed=1.0, pad_between_segments=0, remove_silence=True, minimum_silence=0.20, custom_voicepack=None, trim=0.0):
    """Generate speech for *text*, reloading the model only when required."""
    update_model(model_name)  # No-op when the checkpoint is already loaded
    if not minimum_silence:
        minimum_silence = 0.05  # Zero/None would strip every pause entirely
    keep_silence = int(minimum_silence * 1000)  # seconds -> milliseconds
    save_at = tts_file_name(text)
    if custom_voicepack:
        if manage_files(custom_voicepack):
            # Validated upload: use the custom .pt pack instead of the preset.
            voice_name = custom_voicepack
        else:
            gr.Warning("Upload small size .pt file only. Using the Current voice pack instead.")
    return tts_maker(
        text,
        voice_name,
        speed,
        trim,
        pad_between_segments,
        save_at,
        remove_silence,
        keep_silence,
    )
103
 
104
-
105
-
106
-
107
import gradio as gr

# voice_list = [
#     'af', # Default voice is a 50-50 mix of af_bella & af_sarah
#     'af_bella', 'af_sarah', 'am_adam', 'am_michael',
#     'bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis',
# ]

import os

# Collect the available voicepack names (files under ./KOKORO/voices with
# their .pt extension stripped), ordered shortest name first.
voice_list = sorted(
    (os.path.splitext(filename)[0]
     for filename in os.listdir("./KOKORO/voices")
     if filename.endswith('.pt')),
    key=len,
)
128
-
129
def toggle_autoplay(autoplay):
    """Rebuild the output Audio component with the requested autoplay flag."""
    updated_audio = gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)
    return updated_audio
131
 
132
# Tab 1: "Batched TTS" — one text box in, one audio clip out.
with gr.Blocks() as demo1:
    gr.Markdown("# Batched TTS")
    gr.Markdown("[Install on Windows/Linux](https://github.com/NeuralFalconYT/Kokoro-82M-WebUI)")

    with gr.Row():
        with gr.Column():
            text = gr.Textbox(
                label='Enter Text',
                lines=3,
                placeholder="Type your text here..."
            )
            with gr.Row():
                voice = gr.Dropdown(
                    voice_list,
                    value='af_bella',
                    allow_custom_value=False,
                    label='Voice',
                    info='Starred voices are more stable'
                )
            with gr.Row():
                generate_btn = gr.Button('Generate', variant='primary')
            with gr.Accordion('Audio Settings', open=False):
                model_name=gr.Dropdown(model_list,label="Model",value=model_list[0])
                speed = gr.Slider(
                    minimum=0.25, maximum=2, value=1, step=0.1,
                    label='⚡️Speed', info='Adjust the speaking speed'
                )
                remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
                minimum_silence = gr.Number(
                    label="Keep Silence Upto (In seconds)",
                    value=0.05
                )

                # Trim control intentionally disabled in the UI:
                # trim = gr.Slider(
                # minimum=0, maximum=1, value=0, step=0.1,
                # label='🔪 Trim', info='How much to cut from both ends of each segment'
                # )
                pad_between = gr.Slider(
                    minimum=0, maximum=2, value=0, step=0.1,
                    label='🔇 Pad Between', info='Silent Duration between segments [For Large Text]'
                )

                custom_voicepack = gr.File(label='Upload Custom VoicePack .pt file')

        with gr.Column():
            audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
            with gr.Accordion('Enable Autoplay', open=False):
                autoplay = gr.Checkbox(value=True, label='Autoplay')
                autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])

    # Both Enter-in-textbox and the Generate button run the same pipeline.
    text.submit(
        text_to_speech,
        inputs=[text, model_name,voice, speed, pad_between, remove_silence, minimum_silence,custom_voicepack],
        outputs=[audio]
    )
    generate_btn.click(
        text_to_speech,
        inputs=[text,model_name, voice, speed, pad_between, remove_silence, minimum_silence,custom_voicepack],
        outputs=[audio]
    )
192
-
193
def podcast_maker(text, remove_silence=False, minimum_silence=50, model_name="kokoro-v0_19.pth"):
    """Render multi-speaker '{voice} line' text into a single podcast file."""
    global MODEL, device
    update_model(model_name)  # Ensure the requested checkpoint is active
    if not minimum_silence:
        minimum_silence = 0.05  # Fall back to a tiny gap instead of none
    keep_silence = int(minimum_silence * 1000)  # seconds -> milliseconds
    return podcast(MODEL, device, text, remove_silence=remove_silence, minimum_silence=keep_silence)
201
-
202
-
203
-
204
# Placeholder text for the multi-speaker tab (note: name keeps its original
# misspelling because it is referenced below).
dummpy_example="""{af_alloy} Hello, I'd like to order a sandwich please.
{af_sky} What do you mean you're out of bread?
{af_bella} I really wanted a sandwich though...
{af_nicole} You know what, darn you and your little shop!
{bm_george} I'll just go back home and cry now.
{am_adam} Why me?"""

# Tab 2: multiple speech types — each line is prefixed with {voice_name}.
with gr.Blocks() as demo2:
    gr.Markdown(
        """
        # Multiple Speech-Type Generation
        This section allows you to generate multiple speech types or different VOICE PACK's at same text Input. Enter your text in the format shown below, and the system will generate speech using the appropriate type. If unspecified, the model will use "af" voice.
        Format:
        {voice_name} your text here
        """
    )
    with gr.Row():
        gr.Markdown(
            """
            **Example Input:**
            {af_alloy} Hello, I'd like to order a sandwich please.
            {af_sky} What do you mean you're out of bread?
            {af_bella} I really wanted a sandwich though...
            {af_nicole} You know what, darn you and your little shop!
            {bm_george} I'll just go back home and cry now.
            {am_adam} Why me?!
            """
        )
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(
                label='Enter Text',
                lines=7,
                placeholder=dummpy_example
            )
            with gr.Row():
                generate_btn = gr.Button('Generate', variant='primary')
            with gr.Accordion('Audio Settings', open=False):
                remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
                minimum_silence = gr.Number(
                    label="Keep Silence Upto (In seconds)",
                    value=0.20
                )
        with gr.Column():
            audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
            with gr.Accordion('Enable Autoplay', open=False):
                autoplay = gr.Checkbox(value=True, label='Autoplay')
                autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])

    # Enter key and the Generate button both feed podcast_maker().
    text.submit(
        podcast_maker,
        inputs=[text, remove_silence, minimum_silence],
        outputs=[audio]
    )
    generate_btn.click(
        podcast_maker,
        inputs=[text, remove_silence, minimum_silence],
        outputs=[audio]
    )
262
-
263
-
264
-
265
-
266
import shutil
import os

# Ensure the output directory exists
# Intermediate audio files are written here; created up front so later
# writes never fail on a missing folder.
output_dir = "./temp_audio"
os.makedirs(output_dir, exist_ok=True)
272
-
273
-
274
-
275
-
276
-
277
-
278
-
279
-
280
-
281
- #@title Generate Audio File From Subtitle
282
- # from tqdm.notebook import tqdm
283
- from tqdm import tqdm
284
- import subprocess
285
- import json
286
- import pysrt
287
- import os
288
- from pydub import AudioSegment
289
- import shutil
290
- import uuid
291
- import re
292
- import time
293
-
294
- # os.chdir(install_path)
295
-
296
- # def your_tts(text,audio_path,actual_duration,speed=1.0):
297
- # global srt_voice_name
298
- # model_name="kokoro-v0_19.pth"
299
- # tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speed,trim=1.0)
300
- # # print(tts_path)
301
- # tts_audio = AudioSegment.from_file(tts_path)
302
- # tts_duration = len(tts_audio)
303
- # if tts_duration > actual_duration:
304
- # speedup_factor = tts_duration / actual_duration
305
- # tts_path=text_to_speech(text, model_name, voice_name=srt_voice_name,speed=speedup_factor,trim=1.0)
306
- # # print(tts_path)
307
- # shutil.copy(tts_path,audio_path)
308
-
309
-
310
def your_tts(text, audio_path, actual_duration, speed=1.0):
    """Synthesize *text* and copy the result to *audio_path*.

    When the first render overruns the subtitle slot (*actual_duration*,
    in milliseconds per pydub's len()), it is re-rendered faster so the
    clip fits.
    """
    global srt_voice_name
    model_name = "kokoro-v0_19.pth"

    # First pass at the requested speed.
    tts_path = text_to_speech(text, model_name, voice_name=srt_voice_name, speed=speed, trim=1.0)
    rendered = AudioSegment.from_file(tts_path)

    # A positive target duration means we must fit the subtitle window;
    # re-render with a proportional speed-up when the clip runs long.
    if actual_duration > 0 and len(rendered) > actual_duration:
        speedup_factor = len(rendered) / actual_duration
        tts_path = text_to_speech(text, model_name, voice_name=srt_voice_name, speed=speedup_factor, trim=1.0)

    shutil.copy(tts_path, audio_path)
327
-
328
-
329
-
330
-
331
-
332
base_path = "."
import datetime

def get_current_time():
    """Return the local time formatted as HH_MM_AM/PM (e.g. ``07_05_PM``)."""
    now = datetime.datetime.now()
    return now.strftime("%I_%M_%p")
337
-
338
def get_subtitle_Dub_path(srt_file_path, Language="en"):
    """Build a unique output .wav path under TTS_DUB for a dubbed subtitle.

    The name combines the SRT base name, the target language, the current
    time and a short random suffix so concurrent runs never collide.
    """
    file_name = os.path.splitext(os.path.basename(srt_file_path))[0]
    dub_dir = f"{base_path}/TTS_DUB"
    # makedirs(exist_ok=True) is race-free, unlike the old exists()+mkdir() pair.
    os.makedirs(dub_dir, exist_ok=True)
    random_string = str(uuid.uuid4())[:6]
    return f"{dub_dir}/{file_name}_{Language}_{get_current_time()}_{random_string}.wav"
345
-
346
-
347
-
348
-
349
-
350
-
351
-
352
-
353
def clean_srt(input_path):
    """Rewrite an SRT file with noise characters removed from each cue.

    Strips brackets, music glyphs and embedded newlines from every
    subtitle's text, writes the cleaned copy to ``save_srt/<name>`` and
    returns that output path.
    """
    file_name = os.path.basename(input_path)
    output_folder = f"{base_path}/save_srt"
    os.makedirs(output_folder, exist_ok=True)  # race-free vs exists()+mkdir()
    output_path = f"{output_folder}/{file_name}"

    def clean_srt_line(text):
        # Characters that confuse the TTS front-end.
        bad_list = ["[", "]", "♫", "\n"]
        for i in bad_list:
            text = text.replace(i, "")
        return text.strip()

    # Load the subtitle file
    subs = pysrt.open(input_path)

    # Re-emit each cue in standard SRT layout with cleaned text. The 'with'
    # block closes the file; the old explicit file.close() was redundant.
    with open(output_path, "w", encoding='utf-8') as file:
        for sub in subs:
            file.write(f"{sub.index}\n")
            file.write(f"{sub.start} --> {sub.end}\n")
            file.write(f"{clean_srt_line(sub.text)}\n")
            file.write("\n")
    # print(f"Clean SRT saved at: {output_path}")
    return output_path
379
- # Example usage
380
-
381
-
382
-
383
-
384
- import librosa
385
- import soundfile as sf
386
- import subprocess
387
-
388
def speedup_audio_librosa(input_file, output_file, speedup_factor):
    """Time-stretch *input_file* by *speedup_factor* (pitch preserved).

    Writes the result to *output_file*; on any failure, warns and copies
    the input through unchanged so the pipeline keeps going.
    """
    try:
        samples, sample_rate = librosa.load(input_file, sr=None)
        stretched = librosa.effects.time_stretch(samples, rate=speedup_factor)
        # Keep the original sample rate on output.
        sf.write(output_file, stretched, sample_rate)
    except Exception as e:
        gr.Warning(f"Error during speedup with Librosa: {e}")
        shutil.copy(input_file, output_file)
403
-
404
-
405
-
406
 
407
def is_ffmpeg_installed():
    """Probe for a usable ffmpeg binary.

    Returns a ``(found, path)`` tuple. On Windows a bundled ./ffmpeg copy
    is used; elsewhere the binary is resolved from PATH. When the probe
    fails, a UI warning announces the librosa fallback.
    """
    if platform.system() == "Windows":
        local_ffmpeg_path = os.path.join("./ffmpeg", "ffmpeg.exe")
    else:
        local_ffmpeg_path = "ffmpeg"
    probe_cmd = [local_ffmpeg_path, "-version"]
    try:
        subprocess.run(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)
    except (FileNotFoundError, subprocess.CalledProcessError):
        gr.Warning("FFmpeg is not installed. Using 'librosa' for speedup audio in SRT dubbing", duration=20)
        return False, local_ffmpeg_path
    return True, local_ffmpeg_path
420
-
421
-
422
-
423
-
424
- # ffmpeg -i test.wav -filter:a "atempo=2.0" ffmpeg.wav -y
425
# ffmpeg -i test.wav -filter:a "atempo=2.0" ffmpeg.wav -y
def change_speed(input_file, output_file, speedup_factor):
    """Speed up *input_file* by *speedup_factor* into *output_file*.

    Uses ffmpeg's ``atempo`` filter when available; otherwise (or when
    ffmpeg fails) falls back to the librosa-based time stretch.
    """
    global use_ffmpeg, local_ffmpeg_path
    if use_ffmpeg:
        try:
            subprocess.run([
                local_ffmpeg_path,
                "-i", input_file,
                "-filter:a", f"atempo={speedup_factor}",
                output_file,
                "-y"
            ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        except Exception as e:
            # BUG FIX: gr.Error() only constructs an exception object — it
            # displays nothing unless raised. Since we deliberately recover
            # via librosa here, surface the problem as a non-fatal warning.
            gr.Warning(f"Error during speedup with FFmpeg: {e}")
            speedup_audio_librosa(input_file, output_file, speedup_factor)
    else:
        speedup_audio_librosa(input_file, output_file, speedup_factor)
450
-
451
-
452
-
453
-
454
-
455
-
456
-
457
class SRTDubbing:
    """Turn a cleaned SRT file into one dubbed audio track, keeping every
    cue aligned to its original timestamps (times in milliseconds)."""

    def __init__(self):
        pass

    @staticmethod
    def text_to_speech_srt(text, audio_path, language, actual_duration):
        # Render one cue to a scratch wav, then fit it to the cue's slot:
        # speed up when it runs long, pad with silence when it runs short.
        tts_filename = "./cache/temp.wav"
        your_tts(text,tts_filename,actual_duration,speed=1.0)
        # Check the duration of the generated TTS audio
        tts_audio = AudioSegment.from_file(tts_filename)
        tts_duration = len(tts_audio)

        if actual_duration == 0:
            # If actual duration is zero, use the original TTS audio without modifications
            shutil.move(tts_filename, audio_path)
            return
        # If TTS audio duration is longer than actual duration, speed up the audio
        if tts_duration > actual_duration:
            speedup_factor = tts_duration / actual_duration
            speedup_filename = "./cache/speedup_temp.wav"
            change_speed(tts_filename, speedup_filename, speedup_factor)
            # Replace the original TTS audio with the sped-up version
            shutil.move(speedup_filename, audio_path)
        elif tts_duration < actual_duration:
            # If TTS audio duration is less than actual duration, add silence to match the duration
            silence_gap = actual_duration - tts_duration
            silence = AudioSegment.silent(duration=int(silence_gap))
            new_audio = tts_audio + silence

            # Save the new audio with added silence
            new_audio.export(audio_path, format="wav")
        else:
            # If TTS audio duration is equal to actual duration, use the original TTS audio
            shutil.move(tts_filename, audio_path)

    @staticmethod
    def make_silence(pause_time, pause_save_path):
        # Write a silent wav of pause_time milliseconds; returns its path.
        silence = AudioSegment.silent(duration=pause_time)
        silence.export(pause_save_path, format="wav")
        return pause_save_path

    @staticmethod
    def create_folder_for_srt(srt_file_path):
        # Per-run scratch folder under ./dummy, suffixed with a short uuid
        # so concurrent dubs of the same SRT don't collide.
        srt_base_name = os.path.splitext(os.path.basename(srt_file_path))[0]
        random_uuid = str(uuid.uuid4())[:4]
        dummy_folder_path = f"{base_path}/dummy"
        if not os.path.exists(dummy_folder_path):
            os.makedirs(dummy_folder_path)
        folder_path = os.path.join(dummy_folder_path, f"{srt_base_name}_{random_uuid}")
        os.makedirs(folder_path, exist_ok=True)
        return folder_path

    @staticmethod
    def concatenate_audio_files(audio_paths, output_path):
        # Join the per-cue clips (pauses + speech) in order into one wav.
        concatenated_audio = AudioSegment.silent(duration=0)
        for audio_path in audio_paths:
            audio_segment = AudioSegment.from_file(audio_path)
            concatenated_audio += audio_segment
        concatenated_audio.export(output_path, format="wav")

    def srt_to_dub(self, srt_file_path,dub_save_path,language='en'):
        # Full pipeline: parse the SRT, synthesize each cue preceded by its
        # leading pause, then concatenate everything into dub_save_path.
        result = self.read_srt_file(srt_file_path)
        new_folder_path = self.create_folder_for_srt(srt_file_path)
        join_path = []
        for i in tqdm(result):
            # for i in result:
            text = i['text']
            actual_duration = i['end_time'] - i['start_time']
            pause_time = i['pause_time']
            slient_path = f"{new_folder_path}/{i['previous_pause']}"
            self.make_silence(pause_time, slient_path)
            join_path.append(slient_path)
            tts_path = f"{new_folder_path}/{i['audio_name']}"
            self.text_to_speech_srt(text, tts_path, language, actual_duration)
            join_path.append(tts_path)
        self.concatenate_audio_files(join_path, dub_save_path)

    @staticmethod
    def convert_to_millisecond(time_str):
        # "HH:MM:SS,mmm" -> total milliseconds.
        # NOTE(review): silently returns None for non-string input — callers
        # here always pass regex-matched strings; confirm before reusing.
        if isinstance(time_str, str):
            hours, minutes, second_millisecond = time_str.split(':')
            seconds, milliseconds = second_millisecond.split(",")

            total_milliseconds = (
                int(hours) * 3600000 +
                int(minutes) * 60000 +
                int(seconds) * 1000 +
                int(milliseconds)
            )

            return total_milliseconds

    @staticmethod
    def read_srt_file(file_path):
        # Parse a *cleaned* SRT (fixed 4-line records: index, timing line,
        # single text line, blank). Returns dicts with timings, text, the
        # pause since the previous cue, and the scratch filenames to create.
        # NOTE(review): assumes clean_srt() output; multi-line cues would
        # break the stride-4 layout — confirm upstream guarantees this.
        entries = []
        default_start = 0
        previous_end_time = default_start
        entry_number = 1
        audio_name_template = "{}.wav"
        previous_pause_template = "{}_before_pause.wav"

        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
            # print(lines)
            for i in range(0, len(lines), 4):
                time_info = re.findall(r'(\d+:\d+:\d+,\d+) --> (\d+:\d+:\d+,\d+)', lines[i + 1])
                start_time = SRTDubbing.convert_to_millisecond(time_info[0][0])
                end_time = SRTDubbing.convert_to_millisecond(time_info[0][1])

                current_entry = {
                    'entry_number': entry_number,
                    'start_time': start_time,
                    'end_time': end_time,
                    'text': lines[i + 2].strip(),
                    'pause_time': start_time - previous_end_time if entry_number != 1 else start_time - default_start,
                    'audio_name': audio_name_template.format(entry_number),
                    'previous_pause': previous_pause_template.format(entry_number),
                }

                entries.append(current_entry)
                previous_end_time = end_time
                entry_number += 1

        # Debug dump of the parsed entries for manual inspection.
        with open("entries.json", "w") as file:
            json.dump(entries, file, indent=4)
        return entries
591
# Default voice for SRT dubbing; srt_process() may override it per request.
srt_voice_name="af_bella"
# Probe ffmpeg once at startup; change_speed() picks its backend from this.
use_ffmpeg,local_ffmpeg_path = is_ffmpeg_installed()
# use_ffmpeg=False
594
-
595
def srt_process(srt_file_path, voice_name, custom_voicepack=None, dest_language="en"):
    """Dub an uploaded .srt subtitle file and return the output audio path.

    Raises:
        gr.Error: when the uploaded file is not an ``.srt`` file.
    """
    global srt_voice_name, use_ffmpeg

    if not srt_file_path.endswith(".srt"):
        # BUG FIX: gr.Error must be *raised* to be shown in the UI; merely
        # calling it constructed an exception object and silently returned.
        raise gr.Error("Please upload a valid .srt file", duration=5)
    if use_ffmpeg:
        gr.Success("Using FFmpeg for audio speedup to sync with subtitle")
    else:
        gr.Warning("Install FFmpeg to ensure high-quality audio when speeding up the audio to sync with subtitle. Default Using 'librosa' for speedup", duration=20)

    # Pick the voice: a validated custom .pt upload wins over the dropdown.
    if custom_voicepack:
        if manage_files(custom_voicepack):
            srt_voice_name = custom_voicepack
        else:
            srt_voice_name = voice_name
            gr.Warning("Upload small size .pt file only. Using the Current voice pack instead.")
    else:
        srt_voice_name = voice_name
    srt_dubbing = SRTDubbing()
    dub_save_path = get_subtitle_Dub_path(srt_file_path, dest_language)
    srt_dubbing.srt_to_dub(srt_file_path, dub_save_path, dest_language)
    return dub_save_path
618
-
619
- #
620
- # srt_file_path="./long.srt"
621
- # dub_audio_path=srt_process(srt_file_path)
622
- # print(f"Audio file saved at: {dub_audio_path}")
623
-
624
-
625
-
626
# Tab 3: SRT dubbing — upload a subtitle file, get a time-aligned audio track.
with gr.Blocks() as demo3:

    gr.Markdown(
        """
        # Generate Audio File From Subtitle [Upload Only .srt file]

        To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle)

        [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NeuralFalconYT/Whisper-Turbo-Subtitle/blob/main/Whisper_Turbo_Subtitle.ipynb)
        """
    )
    with gr.Row():
        with gr.Column():
            srt_file = gr.File(label='Upload .srt Subtitle File Only')
            with gr.Row():
                voice = gr.Dropdown(
                    voice_list,
                    value='af_bella',
                    allow_custom_value=False,
                    label='Voice',
                )
            with gr.Row():
                generate_btn_ = gr.Button('Generate', variant='primary')

            with gr.Accordion('Audio Settings', open=False):
                custom_voicepack = gr.File(label='Upload Custom VoicePack .pt file')

        with gr.Column():
            audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
            with gr.Accordion('Enable Autoplay', open=False):
                autoplay = gr.Checkbox(value=True, label='Autoplay')
                autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])

    # srt_file.submit(
    #     srt_process,
    #     inputs=[srt_file, voice],
    #     outputs=[audio]
    # )
    generate_btn_.click(
        srt_process,
        inputs=[srt_file,voice,custom_voicepack],
        outputs=[audio]
    )
671
 
672
-
673
-
674
- #### Voice mixing
675
- # modified from here
676
- # https://huggingface.co/spaces/ysharma/Make_Custom_Voices_With_KokoroTTS
677
def get_voices():
    """Load every voicepack tensor and build its UI display label.

    Returns:
        (voices, slider_configs): dict of voice name -> tensor moved to the
        active device, and dict of voice name -> label for the mixer UI.
    """
    voices = {}
    for filename in os.listdir("./KOKORO/voices"):
        if not filename.endswith(".pt"):
            continue
        name = filename.replace(".pt", "")
        voices[name] = torch.load(f"./KOKORO/voices/{filename}", weights_only=True).to(device)

    # Hand-picked labels for the special voices.
    special_labels = {
        "af": "Default 👩🇺🇸",
        "af_nicole": "Nicole 😏🇺🇸",
        "af_bella": "Bella 🤗🇺🇸",
    }

    slider_configs = {}
    for name in voices:
        if name in special_labels:
            slider_configs[name] = special_labels[name]
            continue

        # 'a*' prefixes are American voices, the rest are British.
        country = "🇺🇸" if name.startswith("a") else "🇬🇧"

        # Gender marker from the f_/m_ infix; plain names get a neutral face.
        if "f_" in name:
            display_name = f"{name.split('_')[-1].capitalize()} 👩{country}"
        elif "m_" in name:
            display_name = f"{name.split('_')[-1].capitalize()} 👨{country}"
        else:
            display_name = f"{name.capitalize()} 😐"

        slider_configs[name] = display_name

    return voices, slider_configs

voices, slider_configs = get_voices()
716
-
717
-
718
def parse_voice_formula(formula):
    """Parse "name * w + name * w" into a weight-normalised voice tensor."""
    global voices
    if not formula.strip():
        raise ValueError("Empty voice formula")

    weighted_sum = None
    total_weight = 0
    for term in formula.split('+'):
        # Each term must look like "voice_name * 0.333".
        pieces = term.strip().split('*')
        if len(pieces) != 2:
            raise ValueError(f"Invalid term format: {term.strip()}. Should be 'voice_name * weight'")

        voice_name = pieces[0].strip()
        weight = float(pieces[1].strip())
        total_weight += weight

        if voice_name not in voices:
            raise ValueError(f"Unknown voice: {voice_name}")

        contribution = weight * voices[voice_name]
        weighted_sum = contribution if weighted_sum is None else weighted_sum + contribution

    # Normalise so the mix is independent of the absolute weight scale.
    return weighted_sum / total_weight
753
-
754
-
755
-
756
-
757
-
758
-
759
-
760
def get_new_voice(formula):
    """Build a mixed voicepack from *formula*, save it, return the .pt path.

    Any failure is re-raised as gr.Error so the UI shows the message.
    """
    try:
        mixed_tensor = parse_voice_formula(formula)
        voice_pack_name = "./weighted_normalised_voices.pt"
        torch.save(mixed_tensor, voice_pack_name)
        return voice_pack_name
    except Exception as e:
        raise gr.Error(f"Failed to create voice: {str(e)}")
772
-
773
-
774
def generate_voice_formula(*values):
    """
    Generate a formatted string showing the normalized voice combination.

    Expects the first half of *values* to be checkbox states and the second
    half the matching slider weights, in slider_configs' insertion order.

    Returns: String like "0.6 * voice1" or "0.4 * voice1 + 0.6 * voice2"
    """
    n = len(values) // 2
    checkbox_values = values[:n]
    slider_values = list(values[n:])
    global slider_configs
    # BUG FIX: slider_configs is a dict keyed by voice name, so the old
    # integer indexing (slider_configs[i][0]) raised KeyError. Pair each
    # slider with its voice name via the dict's insertion order instead.
    voice_names = list(slider_configs)
    active_pairs = [(slider_values[i], voice_names[i])
                    for i in range(len(voice_names))
                    if checkbox_values[i]]

    if not active_pairs:
        return ""

    # If only one voice is selected, use its actual value
    if len(active_pairs) == 1:
        value, name = active_pairs[0]
        return f"{value:.3f} * {name}"

    # Calculate sum for normalization of multiple voices
    total_sum = sum(value for value, _ in active_pairs)

    if total_sum == 0:
        return ""

    # Generate normalized formula for multiple voices
    terms = []
    for value, name in active_pairs:
        normalized_value = value / total_sum
        terms.append(f"{normalized_value:.3f} * {name}")

    return " + ".join(terms)
809
-
810
-
811
-
812
-
813
-
814
def create_voice_mix_ui():
    """Build the "Kokoro Voice Mixer" tab: per-voice checkboxes/sliders, a
    live formula display, and a preview generator."""
    with gr.Blocks() as demo:
        gr.Markdown(
            """
            # Kokoro Voice Mixer
            Select voices and adjust their weights to create a mixed voice.
            """
        )

        voice_components = {}
        voice_names = list(voices.keys())
        female_voices = [name for name in voice_names if "f_" in name]
        # NOTE(review): "b_" matches no expected voice name (male packs use
        # "m_"), so male_voices is likely always empty and male voices fall
        # into neutral_voices — confirm intent before changing.
        male_voices = [name for name in voice_names if "b_" in name]
        neutral_voices = [name for name in voice_names if "f_" not in name and "b_" not in name]

        # Define how many columns you want
        num_columns = 3

        # Function to generate UI
        # Lays out one checkbox + weight slider per voice, num_columns per row.
        def generate_ui_row(voice_list):
            num_voices = len(voice_list)
            num_rows = (num_voices + num_columns - 1) // num_columns
            for i in range(num_rows):
                with gr.Row():
                    for j in range(num_columns):
                        index = i * num_columns + j
                        if index < num_voices:
                            voice_name = voice_list[index]
                            with gr.Column():
                                checkbox = gr.Checkbox(label=slider_configs[voice_name])
                                weight_slider = gr.Slider(
                                    minimum=0,
                                    maximum=1,
                                    value=1.0,
                                    step=0.01,
                                    interactive=False
                                )
                                voice_components[voice_name] = (checkbox, weight_slider)
                                # Slider is editable only while its voice is ticked;
                                # the default-arg binding freezes the slider per loop turn.
                                checkbox.change(
                                    lambda x, slider=weight_slider: gr.update(interactive=x),
                                    inputs=[checkbox],
                                    outputs=[weight_slider]
                                )

        generate_ui_row(female_voices)
        generate_ui_row(male_voices)
        generate_ui_row(neutral_voices)

        # Flattened [checkbox, slider, checkbox, slider, ...] list, matching
        # the argument layout update_voice_formula() expects.
        formula_inputs = []
        for i in voice_components:
            checkbox, slider = voice_components[i]
            formula_inputs.append(checkbox)
            formula_inputs.append(slider)

        with gr.Row():
            voice_formula = gr.Textbox(label="Voice Formula", interactive=False)

        # Function to dynamically update the voice formula
        def update_voice_formula(*args):
            formula_parts = []
            for i, (checkbox, slider) in enumerate(voice_components.values()):
                if args[i * 2]:  # If checkbox is selected
                    formula_parts.append(f"{list(voice_components.keys())[i]} * {args[i * 2 + 1]:.3f}")
            return " + ".join(formula_parts)

        # Update formula whenever any checkbox or slider changes
        for checkbox, slider in voice_components.values():
            checkbox.change(
                update_voice_formula,
                inputs=formula_inputs,
                outputs=[voice_formula]
            )
            slider.change(
                update_voice_formula,
                inputs=formula_inputs,
                outputs=[voice_formula]
            )

        with gr.Row():
            voice_text = gr.Textbox(
                label='Enter Text',
                lines=3,
                placeholder="Type your text here to preview the custom voice..."
            )
            voice_generator = gr.Button('Generate', variant='primary')
        with gr.Accordion('Audio Settings', open=False):
            model_name=gr.Dropdown(model_list,label="Model",value=model_list[0])
            speed = gr.Slider(
                minimum=0.25, maximum=2, value=1, step=0.1,
                label='⚡️Speed', info='Adjust the speaking speed'
            )
            remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
        with gr.Row():
            voice_audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
        with gr.Row():
            mix_voice_download = gr.File(label="Download VoicePack")
        with gr.Accordion('Enable Autoplay', open=False):
            autoplay = gr.Checkbox(value=True, label='Autoplay')
            autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[voice_audio])

        # Build the mixed voicepack, synthesize a preview, return both paths.
        def generate_custom_audio(text_input, formula_text, model_name, speed, remove_silence):
            try:
                new_voice_pack = get_new_voice(formula_text)
                audio_output_path =text_to_speech(text=text_input, model_name=model_name, voice_name="af", speed=speed, pad_between_segments=0, remove_silence=remove_silence, minimum_silence=0.05,custom_voicepack=new_voice_pack,trim=0.0)
                return audio_output_path,new_voice_pack
            except Exception as e:
                raise gr.Error(f"Failed to generate audio: {e}")

        voice_generator.click(
            generate_custom_audio,
            inputs=[voice_text, voice_formula,model_name,speed,remove_silence],
            outputs=[voice_audio,mix_voice_download]
        )
    return demo

demo4 = create_voice_mix_ui()
932
-
933
-
934
-
935
-
936
- # display_text = " \n".join(voice_list)
937
-
938
- # with gr.Blocks() as demo5:
939
- # gr.Markdown(f"# Voice Names \n{display_text}")
940
-
941
- #get voice names useful for local api
942
- import os
943
- import json
944
-
945
- def get_voice_names():
946
- male_voices, female_voices, other_voices = [], [], []
947
-
948
- for filename in os.listdir("./KOKORO/voices"):
949
- if filename.endswith('.pt'):
950
- name = os.path.splitext(filename)[0]
951
- if "m_" in name:
952
- male_voices.append(name)
953
- elif name=="af":
954
- female_voices.append(name)
955
- elif "f_" in name:
956
- female_voices.append(name)
957
- else:
958
- other_voices.append(name)
959
-
960
- # Sort the lists by the length of the voice names
961
- male_voices = sorted(male_voices, key=len)
962
- female_voices = sorted(female_voices, key=len)
963
- other_voices = sorted(other_voices, key=len)
964
-
965
- return json.dumps({
966
- "male_voices": male_voices,
967
- "female_voices": female_voices,
968
- "other_voices": other_voices
969
- }, indent=4)
970
-
971
- with gr.Blocks() as demo5:
972
- gr.Markdown(f"# Voice Names")
973
- gr.Markdown("[Install on Windows/Linux](https://github.com/NeuralFalconYT/Kokoro-82M-WebUI)")
974
- get_voice_button = gr.Button("Get Voice Names")
975
- voice_names = gr.Textbox(label="Voice Names", placeholder="Click 'Get Voice Names' to display the list of available voice names", lines=10)
976
- get_voice_button.click(get_voice_names, outputs=[voice_names])
977
-
978
-
979
-
980
-
981
-
982
-
983
- import click
984
- @click.command()
985
- @click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
986
- @click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
987
- def main(debug, share):
988
- demo = gr.TabbedInterface([demo1, demo2,demo3,demo4,demo5], ["Batched TTS", "Multiple Speech-Type Generation","SRT Dubbing","Voice Mix","Available Voice Names"],title="Kokoro TTS",theme='JohnSmith9982/small_and_pretty')
989
-
990
- demo.queue().launch(debug=debug, share=share)
991
- #Run on local network
992
- # laptop_ip="192.168.0.30"
993
- # port=8080
994
- # demo.queue().launch(debug=debug, share=share,server_name=laptop_ip,server_port=port)
995
 
996
  if __name__ == "__main__":
997
- main()
998
-
999
-
1000
- ##For client side
1001
- # from gradio_client import Client
1002
- # import shutil
1003
- # import os
1004
- # os.makedirs("temp_audio", exist_ok=True)
1005
- # from gradio_client import Client
1006
- # client = Client("http://127.0.0.1:7860/")
1007
- # result = client.predict(
1008
- # text="Hello!!",
1009
- # model_name="kokoro-v0_19.pth",
1010
- # voice_name="af_bella",
1011
- # speed=1,
1012
- # trim=0,
1013
- # pad_between_segments=0,
1014
- # remove_silence=False,
1015
- # minimum_silence=0.05,
1016
- # api_name="/text_to_speech"
1017
- # )
1018
-
1019
- # save_at=f"./temp_audio/{os.path.basename(result)}"
1020
- # shutil.move(result, save_at)
1021
- # print(f"Saved at {save_at}")
 
 
1
  from KOKORO.models import build_model
2
+ from KOKORO.utils import tts,tts_file_name
3
  import sys
4
  sys.path.append('.')
5
  import os
 
7
  import torch
8
  import gc
9
  import platform
10
+ import gradio as gr
11
  import shutil
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
# Initialize model
print("Loading model...")
# Prefer the GPU when one is visible to torch; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using device: {device}')
MODEL = build_model('./KOKORO/kokoro-v0_19.pth', device)
print("Model loaded successfully.")
19
 
20
# Discover available voicepacks (every .pt file in the voices folder),
# keeping the short/base names first via a length sort.
voice_list = sorted(
    (
        os.path.splitext(fname)[0]
        for fname in os.listdir("./KOKORO/voices")
        if fname.endswith('.pt')
    ),
    key=len,
)

model_list = ["kokoro-v0_19.pth", "kokoro-v0_19-half.pth"]
current_model = model_list[0]
30
 
31
def update_model(model_name):
    """Load a different TTS checkpoint, reusing the current one when possible.

    Args:
        model_name: Checkpoint file name, e.g. "kokoro-v0_19.pth" or
            "kokoro-v0_19-half.pth".

    Returns:
        str: Human-readable status message.
    """
    global MODEL, current_model
    if current_model == model_name:
        return f"Model already set to {model_name}"

    # The half-precision checkpoint lives in a separate fp16/ subdirectory.
    model_path = f"./KOKORO/{model_name}"
    if model_name == "kokoro-v0_19-half.pth":
        model_path = f"./KOKORO/fp16/{model_name}"

    # Drop the old model BEFORE loading the new one so both never reside in
    # memory simultaneously (matters on small GPUs).
    MODEL = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    try:
        MODEL = build_model(model_path, device)
    except Exception:
        # BUGFIX: previously a failed load left the old model deleted while
        # current_model still named it, so a retry returned "already set" and
        # later use crashed. Clear the bookkeeping so the next call reloads.
        current_model = None
        raise
    current_model = model_name
    return f"Model updated to {model_name}"
45
 
 
46
def manage_files(file_path):
    """Validate an uploaded custom voicepack file.

    A usable voicepack is an existing ``.pt`` file no larger than 5 MB.
    Files that exist but fail validation are deleted so rejected uploads
    do not accumulate.

    Args:
        file_path: Path of the uploaded file; may be None or empty when
            nothing was uploaded.

    Returns:
        bool: True if the file is a usable voicepack, False otherwise.
    """
    # BUGFIX: guard None/empty paths — gr.File yields None when no file is
    # uploaded, and os.path.exists(None) raises TypeError.
    if not file_path or not os.path.exists(file_path):
        return False
    file_extension = os.path.splitext(file_path)[1]
    file_size = os.path.getsize(file_path)
    # Accept only .pt files up to 5 MB.
    if file_extension == ".pt" and file_size <= 5 * 1024 * 1024:
        return True
    os.remove(file_path)
    return False
57
 
58
def text_to_speech(text, model_name="kokoro-v0_19.pth", voice_name="af", speed=1.0,
                   pad_between_segments=0, remove_silence=True, minimum_silence=0.20,
                   custom_voicepack=None, trim=0.0):
    """Synthesize speech for *text* and return the generated audio file path.

    Ensures the requested checkpoint is loaded, optionally substitutes a
    validated custom voicepack, then delegates synthesis to ``tts``.
    """
    # Swap checkpoints only when the requested one differs from the loaded one.
    update_model(model_name)

    # A falsy minimum (0 / None) would disable the silence threshold entirely.
    if not minimum_silence:
        minimum_silence = 0.05

    save_at = tts_file_name(text)

    # Prefer the user-supplied voicepack when it passes validation.
    if custom_voicepack:
        if manage_files(custom_voicepack):
            voice_name = custom_voicepack
        else:
            gr.Warning("Invalid voicepack file. Using default voice instead.")

    return tts(MODEL, device, text, voice_name, speed=speed, trim=trim,
               pad_between_segments=pad_between_segments, output_file=save_at,
               remove_silence=remove_silence, minimum_silence=minimum_silence)
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
def toggle_autoplay(autoplay):
    """Rebuild the output audio component with autoplay switched on or off."""
    return gr.Audio(
        interactive=False,
        label='Output Audio',
        autoplay=autoplay,
    )
80
 
81
# Main Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Kokoro TTS - Batched Text-to-Speech")
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(label='Enter Text', lines=3,
                              placeholder="Type your text here...")
            with gr.Row():
                voice = gr.Dropdown(voice_list, value='af_bella',
                                    allow_custom_value=False,
                                    label='Voice', info='Select a voice')
            with gr.Row():
                generate_btn = gr.Button('Generate', variant='primary')
            with gr.Accordion('Audio Settings', open=False):
                model_name = gr.Dropdown(model_list, label="Model", value=model_list[0])
                speed = gr.Slider(minimum=0.25, maximum=2, value=1, step=0.1,
                                  label='Speed', info='Adjust speaking speed')
                remove_silence = gr.Checkbox(value=False, label='Remove Silence')
                minimum_silence = gr.Number(label="Minimum Silence (seconds)", value=0.05)
                pad_between = gr.Slider(minimum=0, maximum=2, value=0, step=0.1,
                                        label='Pad Between',
                                        info='Silent duration between segments')
                custom_voicepack = gr.File(label='Upload Custom VoicePack (.pt file)')
        with gr.Column():
            audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
            with gr.Accordion('Autoplay Settings', open=False):
                autoplay = gr.Checkbox(value=True, label='Autoplay')
                autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])

    # Enter-in-textbox and the Generate button trigger the same synthesis,
    # so both listeners share one input list.
    synth_inputs = [text, model_name, voice, speed, pad_between, remove_silence,
                    minimum_silence, custom_voicepack]
    text.submit(text_to_speech, inputs=synth_inputs, outputs=[audio])
    generate_btn.click(text_to_speech, inputs=synth_inputs, outputs=[audio])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
if __name__ == "__main__":
    # Queue requests so concurrent long-running TTS jobs are serialized.
    app = demo.queue()
    app.launch()