Update demo.py
Browse files
demo.py
CHANGED
@@ -156,11 +156,7 @@ def Client_PromptedSynth_Text(text, beta, t, diffusion_steps, embedding_scale, r
|
|
156 |
|
157 |
|
158 |
# Repo -> [Hugging Face - 🤗](https://huggingface.co/Respair/xxx) later
|
159 |
-
INTROTXT = """
|
160 |
-
Demo for The Poor Man's TTS, this is run on a single RTX 3090. <br>
|
161 |
-
These networks can only generate natural speech with correct intonations (i.e generating NSFW, non-speech sounds, stutters etc. doesn't work.) <br>
|
162 |
-
Repo -> [Github](https://github.com/Respaired/Project_Kalliope)
|
163 |
-
"""
|
164 |
|
165 |
|
166 |
with gr.Blocks() as audio_inf:
|
@@ -194,15 +190,15 @@ with gr.Blocks() as audio_inf:
|
|
194 |
|
195 |
with gr.Accordion("Advanced Parameters", open=False):
|
196 |
beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1,
|
197 |
-
label="Beta (Diffusion Strength vs. Reference)
|
198 |
info="Diffusion parameter. Higher means LESS like the reference audio. 0 disables diffusion.",
|
199 |
interactive=True)
|
200 |
multispeakersteps = gr.Slider(minimum=3, maximum=15, value=5, step=1,
|
201 |
-
label="Diffusion Steps
|
202 |
info="More steps can improve quality but increase inference time.",
|
203 |
interactive=True)
|
204 |
embscale = gr.Slider(minimum=1, maximum=5, value=1, step=0.1,
|
205 |
-
label="Embedding Scale (Intensity)
|
206 |
info="Impacts expressiveness. High values (> 1.5) might cause artifacts.",
|
207 |
interactive=True)
|
208 |
rate_of_speech = gr.Slider(minimum=0.5, maximum=2,
|
@@ -214,7 +210,7 @@ with gr.Blocks() as audio_inf:
|
|
214 |
|
215 |
t = gr.Slider(minimum=0.1, maximum=2, value=0.7, step=0.05,
|
216 |
label="T (Duration / Temperature)",
|
217 |
-
info="influence of previous sentence on the current one
|
218 |
interactive=True)
|
219 |
|
220 |
with gr.Column(scale=1):
|
@@ -265,20 +261,20 @@ with gr.Blocks() as longform:
|
|
265 |
|
266 |
with gr.Accordion("Advanced Parameters", open=False):
|
267 |
beta_longform = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1,
|
268 |
-
label="Beta (Diffusion Strength vs. Semantic Encoder)
|
269 |
info="Diffusion parameter. Higher means LESS like the inferred style from text. 0 disables diffusion.",
|
270 |
interactive=True)
|
271 |
-
diffusion_steps_longform = gr.Slider(minimum=3, maximum=
|
272 |
-
label="Diffusion Steps
|
273 |
-
info="More steps can improve
|
274 |
interactive=True)
|
275 |
-
embedding_scale_longform = gr.Slider(minimum=1, maximum=
|
276 |
-
label="Embedding Scale (Intensity)
|
277 |
-
info="Impacts expressiveness.
|
278 |
interactive=True)
|
279 |
rate_of_speech_longform = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1,
|
280 |
label="Rate of Speech",
|
281 |
-
info="Adjusts speech speed. 1.0 is normal.",
|
282 |
interactive=True)
|
283 |
t_longform = gr.Slider(minimum=0.1, maximum=2, value=0.8, step=0.1,
|
284 |
label="T (Style Consistency - Primarily English)",
|
@@ -385,7 +381,7 @@ model_details_html = """
|
|
385 |
<li>Style Encoder: Conformer-based.</li>
|
386 |
<li>Duration Predictor: Conformer-based (with cross-attention).</li>
|
387 |
<li>Semantic Encoder: <code>RuModernBERT-base</code> (for text-guidance).</li>
|
388 |
-
<li>Diffusion Sampler: <b
|
389 |
</ul>
|
390 |
</li>
|
391 |
<li><b>Vocoder:</b> <a href="https://github.com/Respaired/RiFornet_Vocoder" target="_blank" style="color: #77abff;">RiFornet</a></li>
|
@@ -407,7 +403,7 @@ model_details_html = """
|
|
407 |
<li>Duration Predictor: Conformer-based (with cross-attention).</li>
|
408 |
<li>Acoustic Decoder: Conformer-based.</li>
|
409 |
<li>Semantic Encoder: <code>DeBERTa V3 Base</code> (for text-guided).</li>
|
410 |
-
<li>Diffusion Sampler: <b>Yes
|
411 |
</ul>
|
412 |
</li>
|
413 |
<li><b>Vocoder:</b> <a href="https://github.com/Respaired/RiFornet_Vocoder" target="_blank" style="color: #77abff;">RiFornet</a>.</li>
|
@@ -446,7 +442,7 @@ with gr.Blocks() as model_details_tab:
|
|
446 |
|
447 |
with gr.Blocks(title="The Poor Man's TTS (Experimental 🔧)", theme="Respair/[email protected]") as demo:
|
448 |
# gr.DuplicateButton("Duplicate Space")
|
449 |
-
|
450 |
|
451 |
|
452 |
gr.TabbedInterface(
|
|
|
156 |
|
157 |
|
158 |
# Repo -> [Hugging Face - 🤗](https://huggingface.co/Respair/xxx) later
|
159 |
+
INTROTXT = """Update v0.01: Darya (RU) now supports style diffusion as well. """
|
|
|
|
|
|
|
|
|
160 |
|
161 |
|
162 |
with gr.Blocks() as audio_inf:
|
|
|
190 |
|
191 |
with gr.Accordion("Advanced Parameters", open=False):
|
192 |
beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1,
|
193 |
+
label="Beta (Diffusion Strength vs. Reference)",
|
194 |
info="Diffusion parameter. Higher means LESS like the reference audio. 0 disables diffusion.",
|
195 |
interactive=True)
|
196 |
multispeakersteps = gr.Slider(minimum=3, maximum=15, value=5, step=1,
|
197 |
+
label="Diffusion Steps",
|
198 |
info="More steps can improve quality but increase inference time.",
|
199 |
interactive=True)
|
200 |
embscale = gr.Slider(minimum=1, maximum=5, value=1, step=0.1,
|
201 |
+
label="Embedding Scale (Intensity)",
|
202 |
info="Impacts expressiveness. High values (> 1.5) might cause artifacts.",
|
203 |
interactive=True)
|
204 |
rate_of_speech = gr.Slider(minimum=0.5, maximum=2,
|
|
|
210 |
|
211 |
t = gr.Slider(minimum=0.1, maximum=2, value=0.7, step=0.05,
|
212 |
label="T (Duration / Temperature)",
|
213 |
+
info="influence of previous sentence on the current one",
|
214 |
interactive=True)
|
215 |
|
216 |
with gr.Column(scale=1):
|
|
|
261 |
|
262 |
with gr.Accordion("Advanced Parameters", open=False):
|
263 |
beta_longform = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1,
|
264 |
+
label="Beta (Diffusion Strength vs. Semantic Encoder)",
|
265 |
info="Diffusion parameter. Higher means LESS like the inferred style from text. 0 disables diffusion.",
|
266 |
interactive=True)
|
267 |
+
diffusion_steps_longform = gr.Slider(minimum=3, maximum=50, value=5, step=1,
|
268 |
+
label="Diffusion Steps",
|
269 |
+
info="More steps can improve diversity but increase inference time; they won't necessarily improve quality.",
|
270 |
interactive=True)
|
271 |
+
embedding_scale_longform = gr.Slider(minimum=1, maximum=10, value=1, step=0.1,
|
272 |
+
label="Embedding Scale (Intensity)",
|
273 |
+
info="Impacts expressiveness.",
|
274 |
interactive=True)
|
275 |
rate_of_speech_longform = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1,
|
276 |
label="Rate of Speech",
|
277 |
+
info="Adjusts speech speed. 1.0 is normal. It may not respond to tiny adjustments.",
|
278 |
interactive=True)
|
279 |
t_longform = gr.Slider(minimum=0.1, maximum=2, value=0.8, step=0.1,
|
280 |
label="T (Style Consistency - Primarily English)",
|
|
|
381 |
<li>Style Encoder: Conformer-based.</li>
|
382 |
<li>Duration Predictor: Conformer-based (with cross-attention).</li>
|
383 |
<li>Semantic Encoder: <code>RuModernBERT-base</code> (for text-guidance).</li>
|
384 |
+
<li>Diffusion Sampler: <b>Yes.</b></li>
|
385 |
</ul>
|
386 |
</li>
|
387 |
<li><b>Vocoder:</b> <a href="https://github.com/Respaired/RiFornet_Vocoder" target="_blank" style="color: #77abff;">RiFornet</a></li>
|
|
|
403 |
<li>Duration Predictor: Conformer-based (with cross-attention).</li>
|
404 |
<li>Acoustic Decoder: Conformer-based.</li>
|
405 |
<li>Semantic Encoder: <code>DeBERTa V3 Base</code> (for text-guided).</li>
|
406 |
+
<li>Diffusion Sampler: <b>Yes.</b></li>
|
407 |
</ul>
|
408 |
</li>
|
409 |
<li><b>Vocoder:</b> <a href="https://github.com/Respaired/RiFornet_Vocoder" target="_blank" style="color: #77abff;">RiFornet</a>.</li>
|
|
|
442 |
|
443 |
with gr.Blocks(title="The Poor Man's TTS (Experimental 🔧)", theme="Respair/[email protected]") as demo:
|
444 |
# gr.DuplicateButton("Duplicate Space")
|
445 |
+
gr.Markdown(INTROTXT)
|
446 |
|
447 |
|
448 |
gr.TabbedInterface(
|