Spaces:

Respair
/

Darya_TTS

Running

App Files Files Community

Respair committed 14 days ago

Commit

19d2432

verified ·

1 Parent(s): fc78676

Update demo.py

Browse files

Files changed (1) hide show

demo.py +16 -20

demo.py CHANGED Viewed

@@ -156,11 +156,7 @@ def Client_PromptedSynth_Text(text, beta, t, diffusion_steps, embedding_scale, r
 # Repo -> [Hugging Face - 🤗](https://huggingface.co/Respair/xxx) later
-INTROTXT = """#
-Demo for The Poor Man's TTS, this is run on a single RTX 3090. <br>
-These networks can only generate natural speech with correct intonations (i.e generating NSFW, non-speech sounds, stutters etc. doesn't work.) <br>
-Repo -> [Github](https://github.com/Respaired/Project_Kalliope)
-"""
 with gr.Blocks() as audio_inf:
@@ -194,15 +190,15 @@ with gr.Blocks() as audio_inf:
             with gr.Accordion("Advanced Parameters", open=False):
                 beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1,
-                                 label="Beta (Diffusion Strength vs. Reference) - Kalliope Only",
                                  info="Diffusion parameter. Higher means LESS like the reference audio. 0 disables diffusion.",
                                  interactive=True)
                 multispeakersteps = gr.Slider(minimum=3, maximum=15, value=5, step=1,
-                                              label="Diffusion Steps - Kalliope Only",
                                               info="More steps can improve quality but increase inference time.",
                                               interactive=True)
                 embscale = gr.Slider(minimum=1, maximum=5, value=1, step=0.1,
-                                     label="Embedding Scale (Intensity) - Kalliope Only",
                                      info="Impacts expressiveness. High values (> 1.5) might cause artifacts.",
                                      interactive=True)
                 rate_of_speech = gr.Slider(minimum=0.5, maximum=2,
@@ -214,7 +210,7 @@ with gr.Blocks() as audio_inf:
                 t = gr.Slider(minimum=0.1, maximum=2, value=0.7, step=0.05,
                               label="T (Duration / Temperature)",
-                              info="inflence of previous sentence on the current one - Kalliope Only",
                               interactive=True)
         with gr.Column(scale=1):
@@ -265,20 +261,20 @@ with gr.Blocks() as longform:
             with gr.Accordion("Advanced Parameters", open=False):
                 beta_longform = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1,
-                                          label="Beta (Diffusion Strength vs. Semantic Encoder) - Kalliope Only",
                                           info="Diffusion parameter. Higher means LESS like the inferred style from text. 0 disables diffusion.",
                                           interactive=True)
-                diffusion_steps_longform = gr.Slider(minimum=3, maximum=15, value=5, step=1,
-                                                     label="Diffusion Steps - Kalliope Only",
-                                                      info="More steps can improve quality but increase inference time.",
                                                      interactive=True)
-                embedding_scale_longform = gr.Slider(minimum=1, maximum=5, value=1, step=0.1,
-                                              label="Embedding Scale (Intensity) - Kalliope Only",
-                                              info="Impacts expressiveness. High values (> 1.5) might cause artifacts.",
                                               interactive=True)
                 rate_of_speech_longform = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1,
                                                     label="Rate of Speech",
-                                                    info="Adjusts speech speed. 1.0 is normal.",
                                                     interactive=True)
                 t_longform = gr.Slider(minimum=0.1, maximum=2, value=0.8, step=0.1,
                                         label="T (Style Consistency - Primarily English)",
@@ -385,7 +381,7 @@ model_details_html = """
                 <li>Style Encoder: Conformer-based.</li>
                 <li>Duration Predictor: Conformer-based (with cross-attention).</li>
                 <li>Semantic Encoder: <code>RuModernBERT-base</code> (for text-guidance).</li>
-                <li>Diffusion Sampler: <b>None currently.</b></li>
             </ul>
         </li>
         <li><b>Vocoder:</b> <a href="https://github.com/Respaired/RiFornet_Vocoder" target="_blank" style="color: #77abff;">RiFornet</a></li>
@@ -407,7 +403,7 @@ model_details_html = """
                 <li>Duration Predictor: Conformer-based (with cross-attention).</li>
                 <li>Acoustic Decoder: Conformer-based.</li>
                 <li>Semantic Encoder: <code>DeBERTa V3 Base</code> (for text-guided).</li>
-                <li>Diffusion Sampler: <b>Yes</b></li>
             </ul>
         </li>
         <li><b>Vocoder:</b> <a href="https://github.com/Respaired/RiFornet_Vocoder" target="_blank" style="color: #77abff;">RiFornet</a>.</li>
@@ -446,7 +442,7 @@ with gr.Blocks() as model_details_tab:
 with gr.Blocks(title="The Poor Man's TTS (Experimental 🔧)", theme="Respair/[email protected]") as demo:
     # gr.DuplicateButton("Duplicate Space")
-    # gr.Markdown(INTROTXT)
     gr.TabbedInterface(

 # Repo -> [Hugging Face - 🤗](https://huggingface.co/Respair/xxx) later
+INTROTXT = """Update v0.01: Darya (RU) now supports style diffusion as well. """
 with gr.Blocks() as audio_inf:
             with gr.Accordion("Advanced Parameters", open=False):
                 beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1,
+                                 label="Beta (Diffusion Strength vs. Reference)",
                                  info="Diffusion parameter. Higher means LESS like the reference audio. 0 disables diffusion.",
                                  interactive=True)
                 multispeakersteps = gr.Slider(minimum=3, maximum=15, value=5, step=1,
+                                              label="Diffusion Steps",
                                               info="More steps can improve quality but increase inference time.",
                                               interactive=True)
                 embscale = gr.Slider(minimum=1, maximum=5, value=1, step=0.1,
+                                     label="Embedding Scale (Intensity)",
                                      info="Impacts expressiveness. High values (> 1.5) might cause artifacts.",
                                      interactive=True)
                 rate_of_speech = gr.Slider(minimum=0.5, maximum=2,
                 t = gr.Slider(minimum=0.1, maximum=2, value=0.7, step=0.05,
                               label="T (Duration / Temperature)",
+                              info="influence of previous sentence on the current one",
                               interactive=True)
         with gr.Column(scale=1):
             with gr.Accordion("Advanced Parameters", open=False):
                 beta_longform = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1,
+                                          label="Beta (Diffusion Strength vs. Semantic Encoder)",
                                           info="Diffusion parameter. Higher means LESS like the inferred style from text. 0 disables diffusion.",
                                           interactive=True)
+                diffusion_steps_longform = gr.Slider(minimum=3, maximum=50, value=5, step=1,
+                                                     label="Diffusion Steps",
+                                                      info="More steps can improve diversity but increase inference time; they won't necessarily improve quality.",
                                                      interactive=True)
+                embedding_scale_longform = gr.Slider(minimum=1, maximum=10, value=1, step=0.1,
+                                              label="Embedding Scale (Intensity)",
+                                              info="Impacts expressiveness.",
                                               interactive=True)
                 rate_of_speech_longform = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1,
                                                     label="Rate of Speech",
+                                                    info="Adjusts speech speed. 1.0 is normal. It may not respond to tiny adjustments.",
                                                     interactive=True)
                 t_longform = gr.Slider(minimum=0.1, maximum=2, value=0.8, step=0.1,
                                         label="T (Style Consistency - Primarily English)",
                 <li>Style Encoder: Conformer-based.</li>
                 <li>Duration Predictor: Conformer-based (with cross-attention).</li>
                 <li>Semantic Encoder: <code>RuModernBERT-base</code> (for text-guidance).</li>
+                <li>Diffusion Sampler: <b>Yes.</b></li>
             </ul>
         </li>
         <li><b>Vocoder:</b> <a href="https://github.com/Respaired/RiFornet_Vocoder" target="_blank" style="color: #77abff;">RiFornet</a></li>
                 <li>Duration Predictor: Conformer-based (with cross-attention).</li>
                 <li>Acoustic Decoder: Conformer-based.</li>
                 <li>Semantic Encoder: <code>DeBERTa V3 Base</code> (for text-guided).</li>
+                <li>Diffusion Sampler: <b>Yes.</b></li>
             </ul>
         </li>
         <li><b>Vocoder:</b> <a href="https://github.com/Respaired/RiFornet_Vocoder" target="_blank" style="color: #77abff;">RiFornet</a>.</li>
 with gr.Blocks(title="The Poor Man's TTS (Experimental 🔧)", theme="Respair/[email protected]") as demo:
     # gr.DuplicateButton("Duplicate Space")
+    gr.Markdown(INTROTXT)
     gr.TabbedInterface(