Spaces:
Runtime error
Runtime error
File size: 5,246 Bytes
5ca847f db3663c 5ca847f aa93b1b 5ca847f 6b12cc3 5ca847f aa93b1b 5ca847f aa93b1b 5ca847f db3663c 6b12cc3 db3663c 5ca847f 51c71fc 5ca847f 4931874 5ca847f db3663c 5ca847f db3663c 5ca847f db3663c 5ca847f db3663c 783ad44 6dfe6e8 937d301 6dfe6e8 db3663c 5a12fa3 6b12cc3 db3663c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
import io
import os
import tempfile
from typing import List
import TTS.api
import TTS.utils.manage as manage
import torch
from pydub import AudioSegment
import gradio as gr
import config
# Select the device the models are moved to: the GPU when CUDA is
# available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
def ask_tos_patch(self, output_path):
    """Stand-in for ``ModelManager.ask_tos`` that always answers yes.

    Prints a notice and returns ``True`` without consulting ``self`` or
    ``output_path``; both parameters exist only to match the signature of
    the method this function replaces.
    """
    print("Automatically accepting the terms of service.")
    return True
# Replace the manager's terms-of-service hook with the auto-accepting stub
# before any model is fetched (presumably so no interactive prompt blocks
# start-up on a headless host -- confirm against the TTS version in use).
manage.ModelManager.ask_tos = ask_tos_patch

# Bare API object, used below only for its download helper.
tts = TTS.api.TTS()

# Loaded models keyed by the identifiers used in ``config.models``.
models = {}
# ``model_id`` rather than ``id``: a module-level loop variable named ``id``
# would shadow the builtin for the rest of this module.
for model_id, model_name in config.models.items():
    tts.download_model_by_name(model_name)
    models[model_id] = TTS.api.TTS(model_name).to(device)
def synthesize_tts(
    text: str = 'Hello, World!',
    speaker_wavs: List[gr.File] = None,
    speaker_idx: str = 'Ana Florence',
    language: str = 'ja',
    temperature: float = 0.65,
    length_penalty: float = 1.0,
    repetition_penalty: float = 1.9,
    top_k: int = 50,
    top_p: float = 0.8,
    speed: float = 1.0,
    enable_text_splitting: bool = True,
):
    """Synthesize speech for ``text`` with the ``'multi'`` model.

    When reference recordings are supplied, each one is converted to WAV,
    written to a temporary file and the list of those files is passed to the
    model as ``speaker_wav``. Otherwise ``speaker_idx`` is passed as
    ``speaker``. Temporary files are removed before the function exits,
    whether it succeeds or fails.

    Args:
        text: Text to synthesize.
        speaker_wavs: Optional uploaded reference recordings. Each item may
            be an object whose ``.name`` attribute is a file path, or a plain
            path string.
        speaker_idx: Speaker name used when no reference recordings are given.
        language: Language code forwarded to the model.
        temperature: Forwarded unchanged to ``tts_to_file``.
        length_penalty: Forwarded unchanged to ``tts_to_file``.
        repetition_penalty: Forwarded unchanged to ``tts_to_file``.
        top_k: Forwarded unchanged to ``tts_to_file``.
        top_p: Forwarded unchanged to ``tts_to_file``.
        speed: Forwarded unchanged to ``tts_to_file``.
        enable_text_splitting: Forwarded unchanged to ``tts_to_file``.

    Returns:
        The bytes that ``tts_to_file`` wrote into an in-memory buffer.

    Raises:
        gr.Error: If a reference recording cannot be decoded or converted
            to WAV.
    """
    temp_files = []
    try:
        for speaker_wav in speaker_wavs or []:
            # Accept both upload objects exposing ``.name`` and bare paths.
            source_path = getattr(speaker_wav, "name", speaker_wav)
            with open(source_path, "rb") as f:
                speaker_wav_bytes = f.read()

            try:
                audio = AudioSegment.from_file(io.BytesIO(speaker_wav_bytes))
                wav_buffer = io.BytesIO()
                audio.export(wav_buffer, format="wav")
            except Exception as e:
                # Raise instead of returning the message: the result feeds an
                # audio output, where a returned string would be taken for a
                # file path and the real cause would never reach the user.
                raise gr.Error(f"Error processing audio file: {e}") from e

            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav_file:
                # Record the path before writing so the cleanup below still
                # removes the file if the write fails part-way.
                temp_files.append(temp_wav_file.name)
                temp_wav_file.write(wav_buffer.getvalue())

        # Reference recordings take precedence over the named speaker.
        if temp_files:
            voice_kwargs = {"speaker_wav": temp_files}
        else:
            voice_kwargs = {"speaker": speaker_idx}

        output_buffer = io.BytesIO()
        models['multi'].tts_to_file(
            text=text,
            language=language,
            file_path=output_buffer,
            temperature=temperature,
            length_penalty=length_penalty,
            repetition_penalty=repetition_penalty,
            top_k=top_k,
            top_p=top_p,
            speed=speed,
            enable_text_splitting=enable_text_splitting,
            **voice_kwargs,
        )
        return output_buffer.getvalue()
    finally:
        for temp_file in temp_files:
            try:
                os.remove(temp_file)
            except FileNotFoundError:
                # Already gone; nothing left to clean up.
                pass
# Speaker names offered in the "Speaker Index" dropdown.
_speaker_names = [
    "Claribel Dervla", "Daisy Studious", "Gracie Wise", "Tammie Ema", "Alison Dietlinde", "Ana Florence",
    "Annmarie Nele", "Asya Anara", "Brenda Stern", "Gitta Nikolina", "Henriette Usha", "Sofia Hellen",
    "Tammy Grit", "Tanja Adelina", "Vjollca Johnnie", "Andrew Chipper", "Badr Odhiambo", "Dionisio Schuyler",
    "Royston Min", "Viktor Eka", "Abrahan Mack", "Adde Michal", "Baldur Sanjin", "Craig Gutsy",
    "Damien Black", "Gilberto Mathias", "Ilkin Urbano", "Kazuhiko Atallah", "Ludvig Milivoj", "Suad Qasim",
    "Torcull Diarmuid", "Viktor Menelaos", "Zacharie Aimilios", "Nova Hogarth", "Maja Ruoho", "Uta Obando",
    "Lidiya Szekeres", "Chandra MacFarland", "Szofi Granger", "Camilla Holmström", "Lilya Stainthorpe",
    "Zofija Kendrick", "Narelle Moon", "Barbora MacLean", "Alexandra Hisakawa", "Alma María", "Rosemary Okafor",
    "Ige Behringer", "Filip Traverse", "Damjan Chapman", "Wulf Carlevaro", "Aaron Dreschner", "Kumar Dahl",
    "Eugenio Mataracı", "Ferran Simen", "Xavier Hayasaka", "Luis Moray", "Marcos Rudaski",
]

# Language codes offered in the "Language" dropdown.
_language_codes = [
    "en", "es", "fr", "de", "it", "pt", "pl", "tr",
    "ru", "nl", "cs", "ar", "zh", "ja", "hu", "ko",
]

# Input components, in the same order as the parameters of synthesize_tts,
# so each value is passed to the matching argument positionally.
inputs = [
    gr.Textbox(label="Text to Synthesize", value="Hello, World!"),
    gr.File(
        label="Speaker WAV files (optional)",
        file_types=["audio"],
        file_count="multiple",
    ),
    gr.Dropdown(choices=_speaker_names, value="Ana Florence", label="Speaker Index"),
    gr.Dropdown(choices=_language_codes, value="en", label="Language"),
    gr.Slider(minimum=0, maximum=1, value=0.65, step=0.01, label="Temperature"),
    gr.Slider(minimum=0.5, maximum=2, value=1.0, step=0.1, label="Length Penalty"),
    gr.Slider(minimum=1.0, maximum=10.0, value=1.9, step=0.1, label="Repetition Penalty"),
    gr.Slider(minimum=1, maximum=100, value=50, step=1, label="Top-K"),
    gr.Slider(minimum=0, maximum=1, value=0.8, step=0.01, label="Top-P"),
    gr.Slider(minimum=0.5, maximum=2, value=1.0, step=0.01, label="Speed"),
    gr.Checkbox(label="Enable Text Splitting", value=True),
]

# Single audio component that receives the value returned by synthesize_tts.
outputs = gr.Audio(label="Generated Speech")

# Build the interface and start serving it as soon as this module runs.
gr.Interface(
    title="Text-to-Speech Synthesis with Gradio",
    fn=synthesize_tts,
    inputs=inputs,
    outputs=outputs,
).launch()
|