Spaces:

preston-cell
/

image-text-to-text

Running

preston-cell committed on Mar 21

Commit

520c499

verified ·

1 Parent(s): eade8cd

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -6,26 +6,26 @@ pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
 narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
-def launch(input):
-    # Step 1: Extract text from image
-    caption = pipe(input_image)[0]['generated_text']
-    # Step 2: Generate speech from the caption
     audio_output = narrator(caption)
-    # Step 3: Save the audio to a temporary file
-    audio_data = audio_output["audio"]
-    sampling_rate = audio_output["sampling_rate"]
-    # Gradio expects a tuple: (numpy_array, sampling_rate)
-    return (np.array(audio_data), sampling_rate)
-iface = gr.Interface(launch,
     fn=launch,
-    inputs=gr.Image(type='pil'),
-    outputs=gr.Audio(type="numpy", label="Narrated Output"),
     title="SeeSay",
     description="Upload an image to hear its context narrated aloud."
-                    )
-iface.launch()

 narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
+def launch(input_image):
+    # Step 1: Extract caption
+    caption = pipe(input_image)[0]["generated_text"]
+    # Step 2: Convert caption to audio
     audio_output = narrator(caption)
+    audio_array = np.array(audio_output["audio"])
+    sample_rate = audio_output["sampling_rate"]
+    # Step 3: Return audio + caption
+    return (audio_array, sample_rate), caption
+# Use keyword arguments only, so `fn` is not also passed positionally
+iface = gr.Interface(
     fn=launch,
+    inputs=gr.Image(type='pil', label="Upload Image"),
+    outputs=[
+        gr.Audio(type="numpy", label="Narrated Audio"),
+        gr.Textbox(label="Extracted Caption")
+    ],
     title="SeeSay",
     description="Upload an image to hear its context narrated aloud."
+)