Spaces:

preston-cell
/

image-text-to-text

Sleeping

preston-cell committed on Mar 27

Commit

3ce024b

verified ·

1 Parent(s): 4c5db6e

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,50 +1,35 @@
 import gradio as gr
 from transformers import pipeline
-from transformers import AutoConfig
-import numpy as np
-from generator import load_csm_1b
-import torchaudio
-# Load the configuration manually
-config = AutoConfig.from_pretrained("sesame/csm-1b")
-# Load the model with config
-generator = load_csm_1b(device="cpu", config=config)
 # Load image-to-text model
 captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
 def process_image(input_image):
     try:
-        # Generate caption
         caption = captioner(input_image)[0]['generated_text']
-        # Generate speech using CSM
-        audio = generator.generate(
-            text=caption,
-            speaker=0,
-            context=[],
-            max_audio_length_ms=10_000,
-        )
-        # Convert the audio tensor to NumPy for Gradio
-        audio_np = audio.unsqueeze(0).cpu().numpy()
-        return (audio_np, generator.sample_rate), caption
-    except Exception as e:
-        return str(e), "Error generating caption or speech."
-# Set up Gradio UI
-iface = gr.Interface(
-    fn=process_image,
-    inputs=gr.Image(type='pil', label="Upload Image"),
-    outputs=[
-        gr.Audio(type="numpy", label="Generated Speech"),
-        gr.Textbox(label="Generated Caption")
-    ],
-    title="🎙️ SeeSay with CSM",
-    description="Upload an image to generate a caption and hear it narrated using CSM."
-)
-iface.launch(share=True)

 import gradio as gr
 from transformers import pipeline
 # Load image-to-text model
 captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
 def process_image(input_image):
     try:
+        # Step 1: Generate caption
         caption = captioner(input_image)[0]['generated_text']
+        return caption
+    except Exception as e:
+        return str(e)
+# Set up Gradio app
+with gr.Blocks(fill_height=True) as demo:
+    with gr.Sidebar():
+        gr.Markdown("# SeeSay - Powered by Sesame CSM")
+        gr.Markdown("This Space extracts captions from images and generates expressive speech using CSM.")
+        gr.Markdown("Sign in with your Hugging Face account to access the model.")
+        button = gr.LoginButton("Sign in")
+    # Image Upload and Caption Generation
+    image_input = gr.Image(type="pil", label="Upload Image")
+    caption_output = gr.Textbox(label="Generated Caption")
+    # Speech Generation using CSM
+    with gr.Row():
+        gr.Markdown("### Speech Generation")
+        gr.load("models/sesame/csm-1b", accept_token=button, provider="hf-inference")
+    # Link input and output
+    image_input.change(fn=process_image, inputs=image_input, outputs=caption_output)
+demo.launch()