import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch

# Load the BLIP large image-captioning model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

# Move the model to GPU if available for faster inference
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Generate a detailed caption for a single image
def generate_detailed_caption(image):
    # Gradio passes None if no image has been captured yet
    if image is None:
        return ""
    inputs = processor(image, return_tensors="pt").to(device)
    with torch.no_grad():  # inference only; skip gradient tracking
        out = model.generate(
            **inputs,
            max_length=75,            # cap caption length to keep generation fast
            num_beams=5,              # moderate beam width: quality vs. speed trade-off
            repetition_penalty=1.8,
            length_penalty=1.0,
            no_repeat_ngram_size=2,
        )
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption
# Gradio interface with webcam capture
iface = gr.Interface(
    fn=generate_detailed_caption,
    inputs=[
        gr.Image(
            type="pil",
            sources=["webcam", "upload"],  # enable webcam capture (Gradio 4.x API)
            label="Capture an image from your webcam",
        )
    ],
    outputs=gr.Textbox(label="Detailed Image Description"),
    title="📷 Fast Image Capture & Description App",
    description="Capture an image with your webcam and let AI quickly generate a detailed description!",
    live=True,  # run captioning automatically whenever the image changes
)
if __name__ == "__main__":
    iface.launch()
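
For a quick sanity check without the browser UI, the captioning function can also be called directly on a local image. A minimal sketch, assuming the script above is saved as app.py and that a test image exists at the (hypothetical) path sample.jpg:

from PIL import Image

from app import generate_detailed_caption  # assumes the script above is saved as app.py

img = Image.open("sample.jpg").convert("RGB")  # hypothetical sample image path
print(generate_detailed_caption(img))

Importing app loads the model once at import time; the Gradio UI only launches when the script is run directly, thanks to the __main__ guard.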