ylacombe committed
Commit cf600c8 · Parent: e45093f

Update app.py

Files changed (1):
app.py +47 -16
app.py CHANGED
@@ -47,9 +47,21 @@ text_client = InferenceClient(
 image_client = Client("https://openskyml-fast-sdxl-stable-diffusion-xl.hf.space/--replicas/ffe2bn2dk/")
 image_negative_prompt = "ultrarealistic, soft lighting, 8k, ugly, text, blurry"
 image_positive_prompt = ""
-image_seed = 9
+image_seed = 6
 
 processor = AutoProcessor.from_pretrained("suno/bark")
+
+def format_speaker_key(key):
+    key = key.replace("v2/", "").split("_")
+
+    return f"Speaker {key[2]} ({key[0]})"
+
+
+voice_presets = [key for key in processor.speaker_embeddings.keys() if "v2/en" in key]
+voice_presets_dict = {
+    format_speaker_key(key): key for key in voice_presets
+}
+
 model = BarkModel.from_pretrained("suno/bark", torch_dtype=torch.float16).to(device)
 sampling_rate = model.generation_config.sample_rate
 silence = np.zeros(int(0.25 * sampling_rate)) # quarter second of silence
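The hunk above turns Bark's raw preset keys into user-facing labels. A minimal standalone sketch of that mapping, assuming the standard `v2/en_speaker_N` naming exposed by `processor.speaker_embeddings` (sample keys are hardcoded here so it runs without downloading the processor):

```python
# Sketch of the mapping introduced above; the app derives the keys from
# processor.speaker_embeddings, while here they are hardcoded for illustration.
def format_speaker_key(key):
    parts = key.replace("v2/", "").split("_")  # "v2/en_speaker_6" -> ["en", "speaker", "6"]
    return f"Speaker {parts[2]} ({parts[0]})"

sample_keys = [f"v2/en_speaker_{i}" for i in range(10)]  # assumed: v2/en_speaker_0..9
voice_presets_dict = {format_speaker_key(k): k for k in sample_keys}

print(voice_presets_dict["Speaker 6 (en)"])  # -> "v2/en_speaker_6", the new default
```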
@@ -125,7 +137,7 @@ def generate_story(
     return output
 
 
-def generate_audio_and_image(story_prompt, voice_preset=voice_preset):
+def generate_audio_and_image(story_prompt, voice_preset="Speaker 6 (en)"):
 
 
    story = generate_story(story_prompt)
@@ -153,7 +165,7 @@ def generate_audio_and_image(story_prompt, voice_preset=voice_preset):
         inputs = model_input[BATCH_SIZE*i:min(BATCH_SIZE*(i+1), len(model_input))]
 
         if len(inputs) != 0:
-            inputs = processor(inputs, voice_preset=voice_preset)
+            inputs = processor(inputs, voice_preset=voice_presets_dict[voice_preset])
 
             speech_output, output_lengths = model.generate(**inputs.to(device), return_output_lengths=True, min_eos_p=0.2)
 
@@ -163,9 +175,12 @@ def generate_audio_and_image(story_prompt, voice_preset=voice_preset):
             pieces += [*speech_output, silence.copy()]
 
     print("Calling image")
-
-    # TODO: if error catch it
-    img = job_img.result()
+    try:
+        img = job_img.result()
+    except Exception as e:
+        print("Unhandled Exception: ", str(e))
+        gr.Warning("Unfortunately there was an issue when generating the image with SDXL.")
+        img = None
 
     return story, (sampling_rate, np.concatenate(pieces)), img
 
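This hunk replaces the `# TODO: if error catch it` note with a real fallback: if the SDXL Space errors, the story and audio are still returned and `img` degrades to `None`. A small sketch of the same pattern, where `job` stands in for the handle presumably created by an earlier `image_client.submit(...)` call outside this diff's context:

```python
import gradio as gr

def result_or_none(job):
    # Mirrors the committed fallback: log the error, keep the demo alive.
    try:
        return job.result()
    except Exception as e:
        print("Unhandled Exception: ", str(e))
        # gr.Warning only surfaces in the UI when called inside a Gradio event handler.
        gr.Warning("Unfortunately there was an issue when generating the image with SDXL.")
        return None
```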
 
@@ -175,16 +190,16 @@ def generate_audio_and_image(story_prompt, voice_preset=voice_preset):
 # Gradio blocks demo
 with gr.Blocks() as demo_blocks:
     gr.Markdown("""<h1 align="center">🐶Children story</h1>""")
-    gr.HTML("""<h3 style="text-align:center;">📢Audio Streaming powered by Gradio (v3.40.0 onwards)🦾! </h3>""")
+    gr.HTML("""<h3 style="text-align:center;">Let Mistral tell you a story</h3>""")
     with gr.Group():
         with gr.Row():
             inp_text = gr.Textbox(label="Story prompt", info="Enter text here")
-            #dd = gr.Dropdown(
-            #    speaker_embeddings,
-            #    value=None,
-            #    label="Available voice presets",
-            #    info="Defaults to no speaker embeddings!"
-            #    )
+            with gr.Accordion("Advanced settings", open=False):
+                voice_preset = gr.Dropdown(
+                    voice_presets_dict,
+                    value="Speaker 6 (en)",
+                    label="Available speakers",
+                )
 
 
     with gr.Row():
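Feeding `voice_presets_dict` straight to `gr.Dropdown` works because iterating a dict yields its keys, so the dropdown lists the friendly labels while the handler resolves the raw Bark key via `voice_presets_dict[voice_preset]`. A toy round-trip reusing the names from the diff (one entry stands in for all ten):

```python
# Round-trip of the dropdown wiring above: label shown to the user,
# raw preset key passed on to the Bark processor.
voice_presets_dict = {"Speaker 6 (en)": "v2/en_speaker_6"}

choices = [str(k) for k in voice_presets_dict]   # dict iteration yields keys
assert choices == ["Speaker 6 (en)"]             # what the dropdown displays

raw_key = voice_presets_dict[choices[0]]         # lookup done inside the handler
assert raw_key == "v2/en_speaker_6"              # what Bark actually receives
```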
@@ -197,8 +212,24 @@ with gr.Blocks() as demo_blocks:
         out_audio = gr.Audio(
             streaming=False, autoplay=True) # needed to stream output audio
         out_text = gr.Text()
-    btn.click(generate_audio_and_image, [inp_text], [out_text, out_audio, image_output]) #[out_audio]) #, out_count])
+    btn.click(generate_audio_and_image, [inp_text, voice_preset], [out_text, out_audio, image_output]) #[out_audio]) #, out_count])
 
-
-
+    with gr.Row():
+        gr.Examples(
+            [
+                "A panda going on an adventure with a caterpillar. This is a story teaching a wonderful life lesson.",
+                "A princess breaks free from a dragon's grip. This evokes women's empowerment and freedom.",
+                "Tell me about the wonders of the world.",
+            ],
+            [inp_text],
+            [out_text, out_audio, image_output],
+            generate_audio_and_image,
+            cache_examples=True,
+        )
+
+    gr.Markdown(
+        """
+        This Space uses **[Bark](https://huggingface.co/docs/transformers/main/en/model_doc/bark)**, [Mistral-7b-instruct](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) and [Fast SD-XL](https://huggingface.co/spaces/openskyml/fast-sdxl-stable-diffusion-xl)!
+        """
+    )
 demo_blocks.queue().launch(debug=True)
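With `cache_examples=True`, the three example prompts are run through `generate_audio_and_image` once when the Space builds, and clicking an example replays the stored story, audio, and image rather than re-generating them. A self-contained toy showing the same wiring, with a hypothetical `shout` function in place of the real handler:

```python
import gradio as gr

def shout(text):
    return text.upper()

with gr.Blocks() as demo:
    inp = gr.Textbox()
    out = gr.Textbox()
    # Positional args mirror the diff above: examples, inputs, outputs, fn.
    # With cache_examples=True, Gradio calls shout("hello") at build time
    # and serves the cached output whenever the example is clicked.
    gr.Examples(["hello"], [inp], [out], shout, cache_examples=True)
```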
 