preston-cell committed (verified)
Commit 005e1c9 · Parent(s): 5379d69

Update app.py

Files changed (1):
  1. app.py +35 -27
app.py CHANGED
@@ -1,35 +1,43 @@
 import gradio as gr
-from transformers import pipeline
-
-# Load image-to-text model
-captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
-
-def process_image(input_image):
-    try:
-        # Step 1: Generate caption
-        caption = captioner(input_image)[0]['generated_text']
-        return caption
-    except Exception as e:
-        return str(e)
-
-# Set up Gradio app
-with gr.Blocks(fill_height=True) as demo:
-    with gr.Sidebar():
-        gr.Markdown("# SeeSay - Powered by Sesame CSM")
-        gr.Markdown("This Space extracts captions from images and generates expressive speech using CSM.")
-        gr.Markdown("Sign in with your Hugging Face account to access the model.")
-        button = gr.LoginButton("Sign in")
-
-    # Image Upload and Caption Generation
-    image_input = gr.Image(type="pil", label="Upload Image")
-    caption_output = gr.Textbox(label="Generated Caption")
-
-    # Speech Generation using CSM
-    with gr.Row():
-        gr.Markdown("### Speech Generation")
-        gr.load("models/sesame/csm-1b", accept_token=button, provider="hf-inference")
-
-    # Link input and output
-    image_input.change(fn=process_image, inputs=image_input, outputs=caption_output)
-
-demo.launch()
+import torch
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+from datasets import load_dataset
+
+device = "cpu"
+torch_dtype = torch.float32
+
+# Load the Whisper model
+model_id = "openai/whisper-large-v3-turbo"
+model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
+)
+model.to(device)
+
+processor = AutoProcessor.from_pretrained(model_id)
+
+pipe = pipeline(
+    "automatic-speech-recognition",
+    model=model,
+    tokenizer=processor.tokenizer,
+    feature_extractor=processor.feature_extractor,
+    torch_dtype=torch_dtype,
+    device=device,
+)
+
+def transcribe(audio_path):
+    try:
+        # Transcribe the audio using Whisper
+        result = pipe(audio_path)
+        return result["text"]
+    except Exception as e:
+        return str(e)
+
+demo = gr.Interface(
+    fn=transcribe,
+    inputs=gr.Audio(type="filepath"),
+    outputs=gr.Textbox(),
+    title="Whisper Speech Recognition",
+    description="Upload an audio file to transcribe using Whisper large-v3-turbo."
+)
+
+demo.launch(share=True)
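Note: the new file imports load_dataset but never calls it, and it pins device = "cpu" with torch.float32 (a reasonable choice for free Space hardware; on a GPU the usual pattern is "cuda:0" with float16). A minimal standalone sketch of how that import could drive a smoke test of the same pipeline outside the Gradio UI; the dataset name and split are assumptions taken from common Whisper usage examples, not from this commit:

# Hypothetical smoke test; self-contained, does not reuse objects from app.py.
import torch
from datasets import load_dataset
from transformers import pipeline

# Same model and CPU/float32 settings as the commit above.
asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",
    torch_dtype=torch.float32,
    device="cpu",
)

# Assumed sample dataset (not referenced anywhere in this commit).
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]  # dict with "array" and "sampling_rate"

print(asr(sample)["text"])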