Spaces:

preston-cell
/

image-text-to-text

Sleeping

preston-cell committed on Mar 27

Commit

26dbd13

verified ·

1 Parent(s): 005e1c9

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -16,7 +16,7 @@ model.to(device)
 processor = AutoProcessor.from_pretrained(model_id)
 pipe = pipeline(
-    "automatic-speech-recognition",
     model=model,
     tokenizer=processor.tokenizer,
     feature_extractor=processor.feature_extractor,
@@ -24,20 +24,21 @@ pipe = pipeline(
     device=device,
 )
-def transcribe(audio_path):
-    try:
-        # Transcribe the audio using Whisper
-        result = pipe(audio_path)
-        return result["text"]
-    except Exception as e:
-        return str(e)
-demo = gr.Interface(
-    fn=transcribe,
-    inputs=gr.Audio(type="filepath"),
-    outputs=gr.Textbox(),
-    title="Whisper Speech Recognition",
-    description="Upload an audio file to transcribe using Whisper large-v3-turbo."
-)
-demo.launch(share=True)

 processor = AutoProcessor.from_pretrained(model_id)
 pipe = pipeline(
+    "text-to-speech",
     model=model,
     tokenizer=processor.tokenizer,
     feature_extractor=processor.feature_extractor,
     device=device,
 )
+def generate_caption_and_speech(input_image):
+    """Caption an image, then synthesize speech for that caption.
+
+    Args:
+        input_image: The image handed over by the Gradio image input.
+
+    Returns:
+        A pair ``((sampling_rate, waveform), caption)``: the audio tuple
+        for the Audio output and the caption text for the Textbox output.
+    """
+    # Generate caption
+    caption = captioner(input_image)[0]['generated_text']
+    # Generate speech from caption
+    audio_output = pipe(caption)
+    # NOTE(review): the pipeline may return audio with a leading channel
+    # axis, e.g. (1, n); squeeze it so Gradio sees a 1-D mono waveform.
+    waveform = np.squeeze(np.array(audio_output["audio"]))
+    # gr.Audio(type="numpy") expects (sample_rate, data), in that order.
+    return (audio_output["sampling_rate"], waveform), caption
+# Gradio Interface: image in, synthesized audio and caption text out.
+iface = gr.Interface(
+    fn=generate_caption_and_speech,
+    inputs=gr.Image(type='pil', label="Upload Image"),
+    outputs=[gr.Audio(type="numpy", label="Generated Audio"), gr.Textbox(label="Generated Caption")],
+    title="SeeSay",
+    # The stray trailing quote after this string was a syntax error.
+    description="Upload an image to generate a caption and hear it described with speech.",
+)
+iface.launch(share=True)