|
|
|
import gradio as gr |
|
import random |
|
import os |
|
import re |
|
from gradio_client import Client, file |
|
|
|
# Gradio client for the remote app referenced by the `src` environment
# variable. A missing variable surfaces as KeyError at import time.
_backend_src = os.environ['src']
client = Client(_backend_src)
|
|
|
|
|
# Directory that holds the example/prompt text files read at start-up.
BASE_PATH = "Inference"

# Paths of the four text files (Russian/English x random/prompt), all
# located directly inside BASE_PATH.
(
    RU_RANDOM_TEXTS_PATH,
    EN_RANDOM_TEXTS_PATH,
    RU_PROMPT_TEXTS_PATH,
    EN_PROMPT_TEXTS_PATH,
) = [
    os.path.join(BASE_PATH, filename)
    for filename in (
        "random_texts.txt",
        "english_random_texts.txt",
        "prompt.txt",
        "english_prompt.txt",
    )
]
|
|
|
|
|
def _read_nonblank_lines(filepath, encoding):
    """Return the stripped, non-empty lines of *filepath* decoded with *encoding*."""
    with open(filepath, 'r', encoding=encoding) as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line]


def _missing_file_placeholder(filepath):
    """Pick the one-item placeholder list matching the kind of file that is missing."""
    is_english = "english" in filepath
    if "random" in filepath:
        if is_english:
            return ["Example English text file not found."]
        return ["Пример русского текстового файла не найден."]
    if "prompt" in filepath:
        if is_english:
            return ["Speaker: Example English prompt file not found."]
        return ["Диктор: Пример русского файла подсказок не найден."]
    return ["Example text file not found."]


def load_texts(filepath):
    """Load the non-blank lines of a text file, never returning an empty list.

    The file is read as UTF-8 (an optional BOM is dropped); if that fails to
    decode, it is re-read as cp1251. Every failure mode yields a one-item
    placeholder list instead of raising, because callers index the result
    with ``[0]`` at import time.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A non-empty list of stripped lines, or a single-item placeholder list
        when the directory or file is missing, the file has no usable lines,
        or any other error occurs while reading.
    """
    directory = os.path.dirname(filepath)
    if directory and not os.path.exists(directory):
        print(f"Warning: Directory '{directory}' not found.")
        return ["Example text file directory not found."]

    try:
        try:
            # 'utf-8-sig' reads plain UTF-8 too, but strips a leading BOM so
            # it does not end up inside the first example text.
            texts = _read_nonblank_lines(filepath, 'utf-8-sig')
        except UnicodeDecodeError:
            print(f"Warning: UTF-8 decode failed for {filepath}. Trying 'cp1251' (common for Russian)...")
            texts = _read_nonblank_lines(filepath, 'cp1251')
    except FileNotFoundError:
        print(f"Warning: File not found - {filepath}")
        return _missing_file_placeholder(filepath)
    except Exception as e:
        print(f"Error loading {filepath}: {e}")
        return ["Error loading example texts."]

    if not texts:
        # An empty or whitespace-only file used to produce [], which made the
        # later `...[0]` lookups fail with IndexError.
        print(f"Warning: No usable lines found in {filepath}")
        return ["Example text file is empty."]
    return texts
|
|
|
# Read all four example files once at import time, in the same order as
# before: Russian random, English random, Russian prompts, English prompts.
(
    ru_random_texts_list,
    en_random_texts_list,
    ru_prompt_texts_list,
    en_prompt_texts_list,
) = [
    load_texts(texts_path)
    for texts_path in (
        RU_RANDOM_TEXTS_PATH,
        EN_RANDOM_TEXTS_PATH,
        RU_PROMPT_TEXTS_PATH,
        EN_PROMPT_TEXTS_PATH,
    )
]
|
|
|
def create_example_dict(text_list):
    """Map a short preview label to each full example text.

    Labels are the first 30 characters of the text followed by ``...``.
    When two different texts share the same 30-character prefix, later ones
    get a numeric suffix (``" (2)"``, ``" (3)"``, ...) so that no example is
    silently overwritten. Exact duplicate texts collapse into one entry.

    Args:
        text_list: List of example strings.

    Returns:
        A dict of label -> full text in input order, or
        ``{"No examples found": ""}`` when the list is empty or its first
        element is not a string.
    """
    if not text_list or not isinstance(text_list[0], str):
        return {"No examples found": ""}

    examples = {}
    for text in text_list:
        base_label = f"{text[:30]}..."
        label = base_label
        counter = 2
        # Advance to the next free label only while the current one is taken
        # by a *different* text.
        while label in examples and examples[label] != text:
            label = f"{base_label} ({counter})"
            counter += 1
        examples[label] = text
    return examples
|
|
|
# Label -> text lookups backing the "Example Prompts" dropdown, one per language.
ru_prompt_examples, en_prompt_examples = map(
    create_example_dict,
    (ru_prompt_texts_list, en_prompt_texts_list),
)
|
|
|
|
|
# Folder scanned for the reference voices offered in the dropdown.
VOICE_DIR = "./reference_sample_wavs"

# Build `voicelist`; every path through this block leaves it non-empty.
try:
    if not os.path.isdir(VOICE_DIR):
        print(f"Warning: Voice directory not found or is not a directory: {VOICE_DIR}. Using placeholder list.")
        voicelist = ["anna_studio.wav", "boris_clear.wav", "female_neutral.wav", "male_deep.wav"]
    else:
        # Keep regular files with a supported audio extension, sorted by name.
        voicelist = sorted(
            entry
            for entry in os.listdir(VOICE_DIR)
            if os.path.isfile(os.path.join(VOICE_DIR, entry))
            and entry.lower().endswith(('.wav', '.mp3', '.flac'))
        )
        if len(voicelist) == 0:
            print(f"Warning: No compatible audio files found in {VOICE_DIR}. Dropdown will be empty.")
            voicelist = ["default.wav"]
except Exception as e:
    print(f"Error listing voices in {VOICE_DIR}: {e}")
    voicelist = ["error_loading_voices"]
|
|
|
|
|
def update_text_input_longform(preview_key, is_english):
    """Return the full example text for the selected dropdown label.

    Looks the label up in the English or Russian example dict depending on
    *is_english*. An unknown label falls back to the first example of that
    language; if the dict is empty, an explanatory message is returned.
    """
    lookup = en_prompt_examples if is_english else ru_prompt_examples
    if preview_key in lookup:
        return lookup[preview_key]
    if not lookup:
        return "Selected example not found or examples failed to load."
    return next(iter(lookup.values()))
|
|
|
|
|
def generate_random_spk(is_english):
    """Draw a random speaker ID for the selected language and log it.

    The ID is uniform over 0..3250 (inclusive) for English and 0..196
    (inclusive) for Russian.
    """
    language, highest_id = ("English", 3250) if is_english else ("Russian", 196)
    rand_id = random.randint(0, highest_id)
    print(f"Generated random {language} Speaker ID: {rand_id}")
    return rand_id
|
|
|
|
|
def Client_Synthesize_Audio(text, voice, voice2_path, spk_id, vcsteps, embscale, beta, ros, t, language_checkbox):
    """Forward a voice-guided synthesis request to the remote ``/Synthesize_Audio`` endpoint.

    All arguments are logged and then passed positionally, in the order of
    the signature, to ``client.predict``.

    Args:
        text: Text to synthesize.
        voice: Name selected in the default reference voice dropdown.
        voice2_path: File path of an uploaded/recorded reference clip, or None.
        spk_id: Speaker ID entered in the UI.
        vcsteps: Diffusion steps slider value.
        embscale: Embedding scale slider value.
        beta: Beta slider value.
        ros: Rate-of-speech slider value.
        t: T slider value.
        language_checkbox: True for English mode.

    Returns:
        Whatever the remote endpoint returns, or ``(44100, np.zeros(1))``
        (a one-sample silent clip) if the call fails for any reason.
    """
    print("--- Client: Calling Synthesize_Audio ---")
    # Guard against None so logging cannot crash before the request is made.
    print(f"Text: {(text or '')[:50]}...")
    print(f"Default Voice: {voice}")
    print(f"Uploaded Voice Path: {voice2_path}")
    print(f"Speaker ID: {spk_id}")
    print(f"Steps: {vcsteps}, Scale: {embscale}, Beta: {beta}, RoS: {ros}, T: {t}")
    print(f"English Mode: {language_checkbox}")

    # NOTE(review): the upload is forwarded as a plain path. Some
    # gradio_client versions expect file inputs wrapped with a file helper —
    # confirm against the installed version.
    voice2_arg = voice2_path

    try:
        # `DummyClient` is not defined in this file. Referencing it directly
        # raised NameError on every call, which the handler below swallowed,
        # so the backend was never contacted. Look it up defensively instead,
        # so the guard still works if such a class is ever provided.
        dummy_cls = globals().get("DummyClient")
        if dummy_cls is not None and isinstance(client, dummy_cls):
            raise ConnectionError("Gradio client not connected.")

        result = client.predict(
            text,
            voice,
            voice2_arg,
            spk_id,
            vcsteps,
            embscale,
            beta,
            ros,
            t,
            language_checkbox,
            api_name="/Synthesize_Audio"
        )
        print("--- Client: Synthesize_Audio call successful ---")
        return result
    except Exception as e:
        # Deliberate best-effort: report the failure and hand the UI a silent
        # placeholder clip instead of propagating the error.
        print(f"--- Client: Error calling Synthesize_Audio: {e} ---")
        import numpy as np
        return (44100, np.zeros(1))
|
|
|
def Client_PromptedSynth_Text(text, beta, t, diffusion_steps, embedding_scale, ros, language_checkbox):
    """Forward a text-guided synthesis request to the remote ``/PromptedSynth_Text`` endpoint.

    All arguments are logged and then passed positionally, in the order of
    the signature, to ``client.predict``.

    Args:
        text: Text to synthesize.
        beta: Beta slider value.
        t: T slider value.
        diffusion_steps: Diffusion steps slider value.
        embedding_scale: Embedding scale slider value.
        ros: Rate-of-speech slider value.
        language_checkbox: True for English mode.

    Returns:
        Whatever the remote endpoint returns, or ``(44100, np.zeros(1))``
        (a one-sample silent clip) if the call fails for any reason.
    """
    print("--- Client: Calling PromptedSynth_Text ---")
    # Guard against None so logging cannot crash before the request is made.
    print(f"Text: {(text or '')[:50]}...")
    print(f"Beta: {beta}, T: {t}, Steps: {diffusion_steps}, Scale: {embedding_scale}, RoS: {ros}")
    print(f"English Mode: {language_checkbox}")

    try:
        # `DummyClient` is not defined in this file. Referencing it directly
        # raised NameError on every call, which the handler below swallowed,
        # so the backend was never contacted. Look it up defensively instead,
        # so the guard still works if such a class is ever provided.
        dummy_cls = globals().get("DummyClient")
        if dummy_cls is not None and isinstance(client, dummy_cls):
            raise ConnectionError("Gradio client not connected.")

        result = client.predict(
            text,
            beta,
            t,
            diffusion_steps,
            embedding_scale,
            ros,
            language_checkbox,
            api_name="/PromptedSynth_Text"
        )
        print("--- Client: PromptedSynth_Text call successful ---")
        return result
    except Exception as e:
        # Deliberate best-effort: report the failure and hand the UI a silent
        # placeholder clip instead of propagating the error.
        print(f"--- Client: Error calling PromptedSynth_Text: {e} ---")
        import numpy as np
        return (44100, np.zeros(1))
|
|
|
|
|
# Markdown intro blurb for the demo. It is not referenced by any other
# statement in this file.
INTROTXT = """#
Demo for The Poor Man's TTS, this is run on a single RTX 3090.
Repo -> [Hugging Face - 🤗](https://huggingface.co/Respair/Project_Kanade_SpeechModel)
**Check the Tips and Model Details tabs below.** <br>
Enjoy!
"""
|
|
|
|
|
# Voice-guided synthesis tab: text plus a reference voice (dropdown entry,
# uploaded clip, or speaker ID) are sent to Client_Synthesize_Audio.
with gr.Blocks() as audio_inf:
    gr.Markdown("### Synthesize speech using a reference audio clip (default, uploaded, or from speaker ID).")
    with gr.Row():
        # Left column: inputs.
        with gr.Column(scale=1):
            language_checkbox_audio = gr.Checkbox(label="English?", value=False,
                                                  info="Tick for English synthesis, leave unchecked for Russian.")
            # Starts with the first Russian example because the checkbox defaults to unchecked.
            inp = gr.Textbox(label="Text",
                             info="Enter the text for voice-guided synthesis.",
                             value=ru_random_texts_list[0],
                             interactive=True,
                             scale=5)

            voice = gr.Dropdown(choices=voicelist,
                                label="Default Reference Voice",
                                info="Select a pre-defined reference voice.",
                                value=voicelist[0] if voicelist else None,
                                interactive=True)
            # type='filepath' means the handler receives a path string (or None).
            # NOTE(review): confirm that gr.Audio in the installed Gradio version accepts `info`.
            voice_2 = gr.Audio(label="Upload Your Audio Reference (Overrides Default Voice & Speaker ID)",
                               sources=["upload", "microphone"],
                               interactive=True,
                               type='filepath',
                               info="Upload a short (5-15s) clear audio clip.",
                               waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})
            # 9999 is the "disabled" value advertised in the info text.
            spk_id = gr.Number(label="Speaker ID (Alternative Reference)",
                               info="Input speaker ID (max 196 Ru / 3250 En) to use a random sample from that speaker on the server. 9999 disables.",
                               value=9999,
                               interactive=True)

            # Fills `spk_id` with a random ID for the selected language (wired below).
            random_spk_btn = gr.Button("Random")

            with gr.Accordion("Advanced Parameters", open=False):
                beta = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1,
                                 label="Beta (Style Strength vs. Reference)",
                                 info="Diffusion parameter. Higher means LESS like the reference audio. 0 disables diffusion.",
                                 interactive=True)
                multispeakersteps = gr.Slider(minimum=3, maximum=15, value=5, step=1,
                                              label="Diffusion Steps",
                                              info="More steps can improve quality but increase inference time.",
                                              interactive=True)
                embscale = gr.Slider(minimum=1, maximum=5, value=1, step=0.1,
                                     label="Embedding Scale (Intensity)",
                                     info="Impacts expressiveness. High values (> 1.5) might cause artifacts.",
                                     interactive=True)
                rate_of_speech = gr.Slider(minimum=0.5, maximum=2,
                                           value=1,
                                           step=0.1,
                                           label="Rate of Speech",
                                           info="Adjusts speech speed. 1.0 is normal.",
                                           interactive=True)

                t = gr.Slider(minimum=0.1, maximum=2, value=1.0, step=0.1,
                              label="T (Duration / Temperature)",
                              info="Controls duration scaling and randomness (T primarily affects English).",
                              interactive=True)

        # Right column: trigger button and output player.
        with gr.Column(scale=1):
            btn = gr.Button("Synthesize (Voice Guided)", variant="primary")
            audio = gr.Audio(interactive=False,
                             label="Synthesized Audio",
                             waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})

    def update_audio_inf_defaults(is_english):
        """Reset the text box and speaker ID field when the language is toggled.

        Returns two updates: the text box gets the first example text of the
        chosen language; the speaker ID gets a language-specific info string
        and is reset to 9999.
        """
        new_text_value = en_random_texts_list[0] if is_english else ru_random_texts_list[0]
        new_spk_info = "Input speaker ID (max 3250 En) or use Randomize. 9999 disables." if is_english else "Input speaker ID (max 196 Ru) or use Randomize. 9999 disables."
        new_spk_val = 9999
        return gr.update(value=new_text_value), gr.update(info=new_spk_info, value=new_spk_val)

    # Event wiring.
    language_checkbox_audio.change(update_audio_inf_defaults,
                                   inputs=[language_checkbox_audio],
                                   outputs=[inp, spk_id])

    random_spk_btn.click(fn=generate_random_spk, inputs=[language_checkbox_audio], outputs=spk_id)

    # The order of `inputs` must match Client_Synthesize_Audio's parameter order.
    btn.click(Client_Synthesize_Audio,
              inputs=[inp, voice, voice_2, spk_id, multispeakersteps, embscale, beta, rate_of_speech, t, language_checkbox_audio],
              outputs=[audio],
              concurrency_limit=4)
|
|
|
|
|
# Text-guided synthesis tab: the text itself (with example prompts to pick
# from) is sent to Client_PromptedSynth_Text; no reference audio is involved.
with gr.Blocks() as longform:
    gr.Markdown("### Synthesize speech using the text content itself to guide the style (semantic prompting).")
    with gr.Row():
        # Left column: inputs.
        with gr.Column(scale=1):
            language_checkbox_longform = gr.Checkbox(label="English?", value=False,
                                                     info="Tick for English synthesis, leave unchecked for Russian.")
            # Starts with the first Russian prompt because the checkbox defaults to unchecked.
            inp_longform = gr.Textbox(label="Text",
                                      info="Enter text; check the format from the examples.",
                                      value=ru_prompt_texts_list[0],
                                      lines=5,
                                      interactive=True,
                                      scale=5)

            with gr.Row():
                # Choices are the preview labels (keys) of the example dict.
                example_dropdown = gr.Dropdown(choices=list(ru_prompt_examples.keys()),
                                               label="Example Prompts",
                                               info="Select an example to load into the text box.",
                                               value=list(ru_prompt_examples.keys())[0] if ru_prompt_examples else None,
                                               interactive=True)

            with gr.Accordion("Advanced Parameters", open=False):
                beta_longform = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1,
                                          label="Beta (Style Strength vs. Semantic Prompt)",
                                          info="Diffusion parameter. Higher means LESS like the inferred style from text. 0 disables diffusion.",
                                          interactive=True)
                diffusion_steps_longform = gr.Slider(minimum=3, maximum=15, value=5, step=1,
                                                     label="Diffusion Steps",
                                                     info="More steps can improve quality but increase inference time.",
                                                     interactive=True)
                embedding_scale_longform = gr.Slider(minimum=1, maximum=5, value=1, step=0.1,
                                                     label="Embedding Scale (Intensity)",
                                                     info="Impacts expressiveness. High values (> 1.5) might cause artifacts.",
                                                     interactive=True)
                rate_of_speech_longform = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1,
                                                    label="Rate of Speech",
                                                    info="Adjusts speech speed. 1.0 is normal.",
                                                    interactive=True)
                t_longform = gr.Slider(minimum=0.1, maximum=2, value=0.8, step=0.1,
                                       label="T (Style Consistency - Primarily English)",
                                       info="Controls the influence of previous sentences' style on the current one.",
                                       interactive=True)

        # Right column: trigger button and output player.
        with gr.Column(scale=1):
            btn_longform = gr.Button("Synthesize (Text Guided)", variant="primary")
            audio_longform = gr.Audio(interactive=False,
                                      label="Synthesized Audio",
                                      waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})

    def update_longform_defaults(is_english):
        """Swap the example dropdown and text box to the chosen language.

        Returns two updates: the dropdown gets that language's labels with the
        first one selected; the text box gets the text for that label. If the
        label is missing, the first available example is used, and a
        hard-coded sample sentence if there are no examples at all.
        """
        examples_dict = en_prompt_examples if is_english else ru_prompt_examples
        new_choices = list(examples_dict.keys())
        new_value = new_choices[0] if new_choices else None
        new_text_value = examples_dict.get(new_value, list(examples_dict.values())[0] if examples_dict else ("Speaker: Example text." if is_english else "Диктор: Пример текста."))

        return gr.update(choices=new_choices, value=new_value), gr.update(value=new_text_value)

    # Event wiring.
    language_checkbox_longform.change(update_longform_defaults,
                                      inputs=[language_checkbox_longform],
                                      outputs=[example_dropdown, inp_longform])

    # Selecting an example copies its full text into the text box.
    example_dropdown.change(fn=update_text_input_longform,
                            inputs=[example_dropdown, language_checkbox_longform],
                            outputs=[inp_longform])

    # The order of `inputs` must match Client_PromptedSynth_Text's parameter
    # order: (text, beta, t, diffusion_steps, embedding_scale, ros, language).
    btn_longform.click(Client_PromptedSynth_Text,
                       inputs=[inp_longform,
                               beta_longform,
                               t_longform,
                               diffusion_steps_longform,
                               embedding_scale_longform,
                               rate_of_speech_longform,
                               language_checkbox_longform],
                       outputs=[audio_longform],
                       concurrency_limit=4)
|
|
|
|
|
# Markdown shown in the "Intuition & Tips" tab.
# A plain string is used: the text has no placeholders, and an f-string
# would turn any future literal brace into a formatting error.
user_guide_text = """
## Quick Notes:

Everything in this demo & the repo (coming soon) is experimental. The main idea is just playing around with different things to see what works when you're limited to training on a pair of RTX 3090s.

The data used for the english model is rough and pretty tough for any TTS model (think debates, real conversations, plus a little bit of cleaner professional performances). It mostly comes from public sources or third parties (no TOS signed). I'll probably write a blog post later with more details.

So far I focused on English and Russian, more can be covered.

---

### Voice-Guided Tab (Using Audio Reference)

* **Options:**
    * **Default Voices:** Pick one from the dropdown (these are stored locally).
    * **Upload Audio:** While the data isn't nearly enough for zero-shotting, you can still test your own samples. make sure to decrease the beta if it didn't sound similar.
    * **Speaker ID:** Use a number (RU: 0-196, EN: 0-3250) to grab a random clip of that speaker from the server's dataset. Hit 'Random' to explore. (Invalid IDs use a default voice on the server).
* **Some notes:**
    * **Not all speakers are equal.** Randomized samples might give you a poor reference sometimes.
    * **Play with Beta:** Values from 0.2 to 0.9 can work well. Higher Beta = LESS like the reference. It works great for some voices, breaks others. please play with different values. (0 = diffusion off).

---

### Text-Guided Tab (Using Text Meaning)

* **Intuition:** Figure out the voice style just from the text itself (using semantic encoders). No audio needed, which makes suitable for real-time use cases.
* **Speaker Prefix:** For Russian, you can use 'Speaker_ + number:'. as for the English, you can use any names. names were randomly assigned during the training of the Encoder.

---

### General Tips

* Punctuation matters for intonation; don't use unsupported symbols.
"""
|
|
|
# Static tab that renders the user guide Markdown.
with gr.Blocks() as info_tab:
    gr.Markdown(user_guide_text)
|
|
|
|
|
# Markdown shown in the "Model Details" tab.
model_details_text = """
## Model Details (The Guts)


---

### Darya (Russian Model) - More Stable

* Generally more controlled than the English one. that's also why in terms of acoustic quality it should sound much better.
* **Setup:** Non-End-to-End (separate steps).
* **Components:**
    * Style Encoder: Conformer-based.
    * Duration Predictor: Conformer-based (with cross-attention).
    * Semantic Encoder: `RuModernBERT-base` (for text-guidance).
    * Diffusion Sampler: **None currently.**
* **Vocoder:** [RiFornet](https://github.com/Respaired/RiFornet_Vocoder)
* **Training:** ~200K steps on ~320 hours of Russian data (mix of conversation & narration, hundreds of speakers).
* **Size:** Lightweight (~< 200M params).
* **Specs:** 44.1kHz output, 128 mel bins.

---

### Kalliope (English Model) - Wild

* **Overall Vibe:** More expressive potential, but also less predictable. Showed signs of overfitting on the noisy data.
* **Setup:** Non-End-to-End.
* **Components:**
    * Style Encoder: Conformer-based.
    * Text Encoder: `ConvNextV2`.
    * Duration Predictor: Conformer-based (with cross-attention).
    * Acoustic Decoder: Conformer-based.
    * Semantic Encoder: `DeBERTa V3 Base` (for text-guided).
    * Diffusion Sampler: **Yes**
* **Vocoder:** [RiFornet](https://github.com/Respaired/RiFornet_Vocoder).
* **Training:** ~100K steps on ~300-400 hours of *very complex & noisy* English data (conversational, whisper, narration, wide emotion range).
* **Size:** Bigger (~1.2B params total, but not all active at once - training was surprisingly doable). Hidden dim 1024, Style vector 512.
* **Specs:** 44.1kHz output, 128 mel bins (but more than half the dataset were 22-24khz or even phone-call quality)

---

*More details might show up in a blog post later.*
"""

# Static tab that renders the model details Markdown.
with gr.Blocks() as model_details_tab:
    gr.Markdown(model_details_text)
|
|
|
|
|
# Base theme with a custom font stack.
# NOTE(review): this `theme` object is never passed to the interface below,
# which receives a theme string instead — confirm whether it is still needed.
theme = gr.themes.Base(
    font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
)

# Top-level app: the four Blocks above become tabs, in the listed order,
# with the tab titles given in the second list.
# NOTE(review): the theme string looks garbled. "[email protected]" is the
# usual residue of e-mail obfuscation applied to a `name@version` reference;
# restore the intended theme id from the original project.
app = gr.TabbedInterface(
    [longform, audio_inf, info_tab, model_details_tab],
    ['Text-guided Synthesis', 'Voice-guided Synthesis', 'Intuition & Tips', 'Model Details'],
    title="The Poor Man's TTS (Experimental)",
    theme="Respair/[email protected]"
)
|
|
|
|
|
# Script entry point: enable the request queue, then launch the app.
if __name__ == "__main__":
    print("Launching Client Gradio App...")
    queued_app = app.queue(api_open=False, max_size=15)
    queued_app.launch(show_api=False, share=True)