Update demo.py
demo.py CHANGED
@@ -1,12 +1,53 @@
+# client_app.py
 import gradio as gr
 import random
 import os
 import re
 from gradio_client import Client, file

+# --- Client Setup ---
+# Try connecting using environment variables first
+client_url = "http://127.0.0.1:7860" # Default if no env var
+try:
+    # Use 'src' if defined (common for HF Spaces)
+    if 'src' in os.environ:
+        client_url = os.environ['src']
+        print(f"Connecting to client URL (src): {client_url}")
+        client = Client(client_url)
+    # Fallback to host/key if src isn't defined but host/key are
+    elif 'host' in os.environ and 'key' in os.environ:
+        client_url = os.environ['host']
+        api_key = os.environ['key']
+        print(f"Connecting to client URL (host): {client_url} using API key.")
+        # Note: gradio_client might expect hf_token for private spaces
+        client = Client(client_url, hf_token=api_key) # Assuming key is hf_token
+    # Fallback to just host if only host is defined
+    elif 'host' in os.environ:
+        client_url = os.environ['host']
+        print(f"Connecting to client URL (host): {client_url} (public/no key)")
+        client = Client(client_url)
+    # Fallback to the hardcoded default if no relevant env vars found
+    else:
+        print("No suitable environment variables (src, host/key, host) found.")
+        print(f"Attempting connection to default URL: {client_url}")
+        client = Client(client_url) # Use the default

+    print("Gradio Client connected successfully.")
+    # Optional: Check API endpoints
+    # print(client.view_api(print_info=True))
+except Exception as e:
+    print(f"Error connecting Gradio Client to {client_url}: {e}")
+    print("Please ensure the source Gradio app is running and the URL/credentials are correct.")
+    # Provide a dummy client to prevent crashes
+    class DummyClient:
+        def predict(*args, **kwargs):
+            print("!!! Gradio Client not connected. Prediction will fail. !!!")
+            import numpy as np
+            return (44100, np.zeros(1)) # Sample rate, empty numpy array
+    client = DummyClient()
+
+
+# --- UI Data Loading (Client-Side) ---
 BASE_PATH = "Inference"
 RU_RANDOM_TEXTS_PATH = os.path.join(BASE_PATH, "random_texts.txt")
 EN_RANDOM_TEXTS_PATH = os.path.join(BASE_PATH, "english_random_texts.txt")
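For context, a minimal sketch of how the rest of demo.py might call the backend through this client. The endpoint name, argument order, and beta value below are assumptions for illustration only; the real signature is whatever the backend app exposes (check client.view_api()).

    # Hypothetical call; "/synthesize" and the argument list are assumed, not taken from demo.py.
    result = client.predict(
        "Speaker: Hello there.",  # text to synthesize
        0.4,                      # assumed beta / diffusion-strength argument
        api_name="/synthesize",   # assumed endpoint name; confirm with client.view_api()
    )
    # The DummyClient fallback returns (sample_rate, waveform), which hints at the shape
    # the UI expects before handing the result to a gr.Audio output.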
@@ -17,7 +58,9 @@ EN_PROMPT_TEXTS_PATH = os.path.join(BASE_PATH, "english_prompt.txt")
 def load_texts(filepath):
     if not os.path.exists(os.path.dirname(filepath)) and os.path.dirname(filepath) != '':
         print(f"Warning: Directory '{os.path.dirname(filepath)}' not found.")
+        # Return a more specific default based on expected content type
+        if "random" in filepath: return ["Default example text."]
+        else: return ["Speaker: Default prompt text."]
     try:
         try:
             with open(filepath, 'r', encoding='utf-8') as f:
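For context, a minimal sketch of how these helpers might feed the UI, assuming each *.txt file holds one example per line; the actual wiring in demo.py may differ.

    en_examples = load_texts(EN_RANDOM_TEXTS_PATH)  # e.g. ["Default example text.", ...]
    en_prompts = load_texts(EN_PROMPT_TEXTS_PATH)   # entries like "Speaker: ...", per the fallback above
    print(random.choice(en_examples))               # 'random' is imported at the top of the file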
@@ -322,90 +365,105 @@ with gr.Blocks() as longform:
         concurrency_limit=4)

 # --- User Guide / Info Tab (Reformatted User Text) ---
+# Convert Markdown-like text to basic HTML for styling
+user_guide_html = f"""
+<div style="background-color: rgba(30, 30, 30, 0.9); color: #f0f0f0; padding: 20px; border-radius: 10px; border: 1px solid #444;">
+<h2 style="border-bottom: 1px solid #555; padding-bottom: 5px;">Quick Notes:</h2>
+<p>Everything in this demo & the repo (coming soon) is experimental. The main idea is just playing around with different things to see what works when you're limited to training on a pair of RTX 3090s.</p>
+<p>The data used for the English model is rough and pretty tough for any TTS model (think debates, real conversations, plus a little bit of cleaner professional performances). It mostly comes from public sources or third parties (no TOS signed). I'll probably write a blog post later with more details.</p>
+<p>So far I've focused on English and Russian; more languages can be covered.</p>
+
+<hr style="border-color: #555; margin: 15px 0;">
+
+<h3 style="color: #a3ffc3;">Voice-Guided Tab (Using Audio Reference)</h3>
+<h4>Options:</h4>
+<ul>
+<li><b>Default Voices:</b> Pick one from the dropdown (these are stored locally).</li>
+<li><b>Upload Audio:</b> While the data isn't nearly enough for zero-shotting, you can still test your own samples. Make sure to decrease the beta if the result doesn't sound similar.</li>
+<li><b>Speaker ID:</b> Use a number (RU: 0-196, EN: 0-3250) to grab a random clip of that speaker from the server's dataset. Hit 'Randomize' to explore. (Invalid IDs use a default voice on the server.)</li>
+</ul>
+<h4>Some notes:</h4>
+<ul>
+<li><b>Not all speakers are equal.</b> Randomized samples might give you a poor reference sometimes.</li>
+<li><b>Play with Beta:</b> Values from 0.2 to 0.9 can work well. Higher beta = LESS like the reference. It works great for some voices and breaks others, so please try different values. (0 = diffusion off.)</li>
+</ul>
+
+<hr style="border-color: #555; margin: 15px 0;">
+
+<h3 style="color: #a3ffc3;">Text-Guided Tab (Using Text Meaning)</h3>
+<ul>
+<li><b>Intuition:</b> Figure out the voice style just from the text itself (using semantic encoders). No audio is needed, which makes it suitable for real-time use cases.</li>
+<li><b>Speaker Prefix:</b> For Russian, you can use 'Speaker_ + number:'. For English, you can use any name; names were randomly assigned during training of the encoder.</li>
+</ul>
+
+<hr style="border-color: #555; margin: 15px 0;">
+
+<h3 style="color: #a3ffc3;">General Tips</h3>
+<ul>
+<li>Punctuation matters for intonation; don't use unsupported symbols.</li>
+</ul>
+</div>
 """

 with gr.Blocks() as info_tab:
+    gr.HTML(user_guide_html) # Use HTML component

 # --- Model Details Tab (Reformatted User Text) ---
+# Convert Markdown-like text to basic HTML for styling
+model_details_html = """
+<div style="background-color: rgba(30, 30, 30, 0.9); color: #f0f0f0; padding: 20px; border-radius: 10px; border: 1px solid #444;">
+<h2 style="border-bottom: 1px solid #555; padding-bottom: 5px;">Model Details (The Guts)</h2>
+
+<hr style="border-color: #555; margin: 15px 0;">
+
+<h3 style="color: #e972ab;">Darya (Russian Model) - More Stable</h3>
+<p>Generally more controlled than the English one, which is also why its acoustic quality should sound much better.</p>
+<ul>
+<li><b>Setup:</b> Non-end-to-end (separate steps).</li>
+<li><b>Components:</b>
+  <ul>
+  <li>Style Encoder: Conformer-based.</li>
+  <li>Duration Predictor: Conformer-based (with cross-attention).</li>
+  <li>Semantic Encoder: <code>RuModernBERT-base</code> (for text guidance).</li>
+  <li>Diffusion Sampler: <b>None currently.</b></li>
+  </ul>
+</li>
+<li><b>Vocoder:</b> <a href="https://github.com/Respaired/RiFornet_Vocoder" target="_blank" style="color: #77abff;">RiFornet</a></li>
+<li><b>Training:</b> ~200K steps on ~320 hours of Russian data (a mix of conversation & narration, hundreds of speakers).</li>
+<li><b>Size:</b> Lightweight (under ~200M params).</li>
+<li><b>Specs:</b> 44.1kHz output, 128 mel bins.</li>
+</ul>
+
+<hr style="border-color: #555; margin: 15px 0;">
+
+<h3 style="color: #e972ab;">Kalliope (English Model) - Wild</h3>
+<p>More expressive potential, but also less predictable; it showed signs of overfitting on the noisy data.</p>
+<ul>
+<li><b>Setup:</b> Non-end-to-end.</li>
+<li><b>Components:</b>
+  <ul>
+  <li>Style Encoder: Conformer-based.</li>
+  <li>Text Encoder: <code>ConvNextV2</code>.</li>
+  <li>Duration Predictor: Conformer-based (with cross-attention).</li>
+  <li>Acoustic Decoder: Conformer-based.</li>
+  <li>Semantic Encoder: <code>DeBERTa V3 Base</code> (for text guidance).</li>
+  <li>Diffusion Sampler: <b>Yes.</b></li>
+  </ul>
+</li>
+<li><b>Vocoder:</b> <a href="https://github.com/Respaired/RiFornet_Vocoder" target="_blank" style="color: #77abff;">RiFornet</a>.</li>
+<li><b>Training:</b> ~100K steps on ~300-400 hours of <i>very complex & noisy</i> English data (conversational, whisper, narration, wide emotion range).</li>
+<li><b>Size:</b> Bigger (~1.2B params total, but not all active at once - training was surprisingly doable). Hidden dim 1024, style vector 512.</li>
+<li><b>Specs:</b> 44.1kHz output, 128 mel bins (though more than half of the dataset was 22-24kHz or even phone-call quality).</li>
+</ul>
+
+<hr style="border-color: #555; margin: 15px 0;">
+
+<p><i>More details might show up in a blog post later.</i></p>
+</div>
 """

 with gr.Blocks() as model_details_tab:
+    gr.HTML(model_details_html) # Use HTML component


 theme = gr.themes.Base(
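For context, a minimal sketch of how these Blocks are typically assembled into the tabbed app referenced by the next hunk's context line (app = gr.TabbedInterface(...)). The interface list and tab titles here are placeholders, not taken from demo.py.

    # Hypothetical assembly; demo.py's actual interface list and titles may differ.
    app = gr.TabbedInterface(
        [longform, info_tab, model_details_tab],        # placeholder ordering
        ["Synthesize", "User Guide", "Model Details"],  # placeholder tab titles
        theme=theme,
    )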
@@ -422,4 +480,8 @@ app = gr.TabbedInterface(
 
 if __name__ == "__main__":
     print("Launching Client Gradio App...")
+    # Warn if we fell back to the DummyClient (guard the lookup: the class only exists if the connection above failed)
+    if 'DummyClient' in globals() and isinstance(client, DummyClient):
+        print("\nWARNING: Gradio Client failed to connect. The app will launch, but synthesis will not work.")
+        print("Please ensure the backend server is running and accessible, then restart this client.\n")
     app.queue(api_open=False, max_size=15).launch(show_api=False, share=True)
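For completeness, a minimal sketch of pointing this client at a backend through the environment variables the script checks ('src', or 'host' plus 'key'); the URL below is a placeholder, not a real endpoint.

    import os
    import subprocess

    os.environ["src"] = "https://your-backend-space.hf.space"  # placeholder backend URL
    subprocess.run(["python", "demo.py"], check=True)           # child process inherits the env var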