Spaces:

preston-cell
/

image-text-to-text

Running

preston-cell committed on Mar 27

Commit

629e04f

verified ·

1 Parent(s): 26dbd13

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,44 +1,35 @@
 import gradio as gr
-import torch
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-from datasets import load_dataset
-device = "cpu"
-torch_dtype = torch.float32
-# Load the Whisper model
-model_id = "openai/whisper-large-v3-turbo"
-model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
-)
-model.to(device)
-processor = AutoProcessor.from_pretrained(model_id)
-pipe = pipeline(
-    "text-to-speech",
-    model=model,
-    tokenizer=processor.tokenizer,
-    feature_extractor=processor.feature_extractor,
-    torch_dtype=torch_dtype,
-    device=device,
-)
-def generate_caption_and_speech(input_image):
-    # Generate caption
-    caption = captioner(input_image)[0]['generated_text']
-    # Generate speech from caption
-    audio_output = pipe(caption)
-    return (np.array(audio_output["audio"]), audio_output["sampling_rate"]), caption
-# Gradio Interface
 iface = gr.Interface(
     fn=generate_caption_and_speech,
     inputs=gr.Image(type='pil', label="Upload Image"),
-    outputs=[gr.Audio(type="numpy", label="Generated Audio"), gr.Textbox(label="Generated Caption")],
-    title="SeeSay",
-    description="Upload an image to generate a caption and hear it described with speech."")
-iface.launch(share=True)

 import gradio as gr
+from transformers import pipeline
+# Load the Whisper model for generating speech
+speech_model = pipeline("text-to-speech", model="openai/whisper-large-v3-turbo")
+# Load the BLIP model for image captioning
+caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
+def generate_caption_and_speech(image):
+    try:
+        # Generate a caption from the image
+        caption = caption_model(image)[0]['generated_text']
+        # Generate speech using the caption
+        speech = speech_model(caption)
+        # Return both the caption and the audio
+        return speech["audio"], caption
+    except Exception as e:
+        return str(e), ""
+# Set up the Gradio interface
 iface = gr.Interface(
     fn=generate_caption_and_speech,
     inputs=gr.Image(type='pil', label="Upload Image"),
+    outputs=[
+        gr.Audio(type="filepath", label="Generated Audio"),
+        gr.Textbox(label="Generated Caption")
+    ],
+    title="SeeSay: Image to Speech",
+    description="Upload an image to generate a caption and hear it described with speech."
+)
+iface.launch(share=True)