Respair committed on
Commit
19d2432
·
verified ·
1 Parent(s): fc78676

Update demo.py

Browse files
Files changed (1) hide show
  1. demo.py +16 -20
demo.py CHANGED
@@ -156,11 +156,7 @@ def Client_PromptedSynth_Text(text, beta, t, diffusion_steps, embedding_scale, r
156
 
157
 
158
  # Repo -> [Hugging Face - 🤗](https://huggingface.co/Respair/xxx) later
159
- INTROTXT = """#
160
- Demo for The Poor Man's TTS, this is run on a single RTX 3090. <br>
161
- These networks can only generate natural speech with correct intonations (i.e generating NSFW, non-speech sounds, stutters etc. doesn't work.) <br>
162
- Repo -> [Github](https://github.com/Respaired/Project_Kalliope)
163
- """
164
 
165
 
166
  with gr.Blocks() as audio_inf:
@@ -194,15 +190,15 @@ with gr.Blocks() as audio_inf:
194
 
195
  with gr.Accordion("Advanced Parameters", open=False):
196
  beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1,
197
- label="Beta (Diffusion Strength vs. Reference) - Kalliope Only",
198
  info="Diffusion parameter. Higher means LESS like the reference audio. 0 disables diffusion.",
199
  interactive=True)
200
  multispeakersteps = gr.Slider(minimum=3, maximum=15, value=5, step=1,
201
- label="Diffusion Steps - Kalliope Only",
202
  info="More steps can improve quality but increase inference time.",
203
  interactive=True)
204
  embscale = gr.Slider(minimum=1, maximum=5, value=1, step=0.1,
205
- label="Embedding Scale (Intensity) - Kalliope Only",
206
  info="Impacts expressiveness. High values (> 1.5) might cause artifacts.",
207
  interactive=True)
208
  rate_of_speech = gr.Slider(minimum=0.5, maximum=2,
@@ -214,7 +210,7 @@ with gr.Blocks() as audio_inf:
214
 
215
  t = gr.Slider(minimum=0.1, maximum=2, value=0.7, step=0.05,
216
  label="T (Duration / Temperature)",
217
- info="inflence of previous sentence on the current one - Kalliope Only",
218
  interactive=True)
219
 
220
  with gr.Column(scale=1):
@@ -265,20 +261,20 @@ with gr.Blocks() as longform:
265
 
266
  with gr.Accordion("Advanced Parameters", open=False):
267
  beta_longform = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1,
268
- label="Beta (Diffusion Strength vs. Semantic Encoder) - Kalliope Only",
269
  info="Diffusion parameter. Higher means LESS like the inferred style from text. 0 disables diffusion.",
270
  interactive=True)
271
- diffusion_steps_longform = gr.Slider(minimum=3, maximum=15, value=5, step=1,
272
- label="Diffusion Steps - Kalliope Only",
273
- info="More steps can improve quality but increase inference time.",
274
  interactive=True)
275
- embedding_scale_longform = gr.Slider(minimum=1, maximum=5, value=1, step=0.1,
276
- label="Embedding Scale (Intensity) - Kalliope Only",
277
- info="Impacts expressiveness. High values (> 1.5) might cause artifacts.",
278
  interactive=True)
279
  rate_of_speech_longform = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1,
280
  label="Rate of Speech",
281
- info="Adjusts speech speed. 1.0 is normal.",
282
  interactive=True)
283
  t_longform = gr.Slider(minimum=0.1, maximum=2, value=0.8, step=0.1,
284
  label="T (Style Consistency - Primarily English)",
@@ -385,7 +381,7 @@ model_details_html = """
385
  <li>Style Encoder: Conformer-based.</li>
386
  <li>Duration Predictor: Conformer-based (with cross-attention).</li>
387
  <li>Semantic Encoder: <code>RuModernBERT-base</code> (for text-guidance).</li>
388
- <li>Diffusion Sampler: <b>None currently.</b></li>
389
  </ul>
390
  </li>
391
  <li><b>Vocoder:</b> <a href="https://github.com/Respaired/RiFornet_Vocoder" target="_blank" style="color: #77abff;">RiFornet</a></li>
@@ -407,7 +403,7 @@ model_details_html = """
407
  <li>Duration Predictor: Conformer-based (with cross-attention).</li>
408
  <li>Acoustic Decoder: Conformer-based.</li>
409
  <li>Semantic Encoder: <code>DeBERTa V3 Base</code> (for text-guided).</li>
410
- <li>Diffusion Sampler: <b>Yes</b></li>
411
  </ul>
412
  </li>
413
  <li><b>Vocoder:</b> <a href="https://github.com/Respaired/RiFornet_Vocoder" target="_blank" style="color: #77abff;">RiFornet</a>.</li>
@@ -446,7 +442,7 @@ with gr.Blocks() as model_details_tab:
446
 
447
  with gr.Blocks(title="The Poor Man's TTS (Experimental 🔧)", theme="Respair/[email protected]") as demo:
448
  # gr.DuplicateButton("Duplicate Space")
449
- # gr.Markdown(INTROTXT)
450
 
451
 
452
  gr.TabbedInterface(
 
156
 
157
 
158
  # Repo -> [Hugging Face - 🤗](https://huggingface.co/Respair/xxx) later
159
+ INTROTXT = """Update v0.01: Darya (RU) now supports style diffusion as well. """
 
 
 
 
160
 
161
 
162
  with gr.Blocks() as audio_inf:
 
190
 
191
  with gr.Accordion("Advanced Parameters", open=False):
192
  beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1,
193
+ label="Beta (Diffusion Strength vs. Reference)",
194
  info="Diffusion parameter. Higher means LESS like the reference audio. 0 disables diffusion.",
195
  interactive=True)
196
  multispeakersteps = gr.Slider(minimum=3, maximum=15, value=5, step=1,
197
+ label="Diffusion Steps",
198
  info="More steps can improve quality but increase inference time.",
199
  interactive=True)
200
  embscale = gr.Slider(minimum=1, maximum=5, value=1, step=0.1,
201
+ label="Embedding Scale (Intensity)",
202
  info="Impacts expressiveness. High values (> 1.5) might cause artifacts.",
203
  interactive=True)
204
  rate_of_speech = gr.Slider(minimum=0.5, maximum=2,
 
210
 
211
  t = gr.Slider(minimum=0.1, maximum=2, value=0.7, step=0.05,
212
  label="T (Duration / Temperature)",
213
+ info="inflence of previous sentence on the current one",
214
  interactive=True)
215
 
216
  with gr.Column(scale=1):
 
261
 
262
  with gr.Accordion("Advanced Parameters", open=False):
263
  beta_longform = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1,
264
+ label="Beta (Diffusion Strength vs. Semantic Encoder)",
265
  info="Diffusion parameter. Higher means LESS like the inferred style from text. 0 disables diffusion.",
266
  interactive=True)
267
+ diffusion_steps_longform = gr.Slider(minimum=3, maximum=50, value=5, step=1,
268
+ label="Diffusion Steps",
269
+ info="More steps can improve diversity but increase inference time, it won't necessarily make it better.",
270
  interactive=True)
271
+ embedding_scale_longform = gr.Slider(minimum=1, maximum=10, value=1, step=0.1,
272
+ label="Embedding Scale (Intensity)",
273
+ info="Impacts expressiveness.",
274
  interactive=True)
275
  rate_of_speech_longform = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1,
276
  label="Rate of Speech",
277
+ info="Adjusts speech speed. 1.0 is normal. it may not respond to tiny adjustments.",
278
  interactive=True)
279
  t_longform = gr.Slider(minimum=0.1, maximum=2, value=0.8, step=0.1,
280
  label="T (Style Consistency - Primarily English)",
 
381
  <li>Style Encoder: Conformer-based.</li>
382
  <li>Duration Predictor: Conformer-based (with cross-attention).</li>
383
  <li>Semantic Encoder: <code>RuModernBERT-base</code> (for text-guidance).</li>
384
+ <li>Diffusion Sampler: <b>**Yes**.</b></li>
385
  </ul>
386
  </li>
387
  <li><b>Vocoder:</b> <a href="https://github.com/Respaired/RiFornet_Vocoder" target="_blank" style="color: #77abff;">RiFornet</a></li>
 
403
  <li>Duration Predictor: Conformer-based (with cross-attention).</li>
404
  <li>Acoustic Decoder: Conformer-based.</li>
405
  <li>Semantic Encoder: <code>DeBERTa V3 Base</code> (for text-guided).</li>
406
+ <li>Diffusion Sampler: <b>Yes.</b></li>
407
  </ul>
408
  </li>
409
  <li><b>Vocoder:</b> <a href="https://github.com/Respaired/RiFornet_Vocoder" target="_blank" style="color: #77abff;">RiFornet</a>.</li>
 
442
 
443
  with gr.Blocks(title="The Poor Man's TTS (Experimental 🔧)", theme="Respair/[email protected]") as demo:
444
  # gr.DuplicateButton("Duplicate Space")
445
+ gr.Markdown(INTROTXT)
446
 
447
 
448
  gr.TabbedInterface(