Spaces:

iamrobotbear
/

blip-vqa-gradio

Paused

File size: 3,155 Bytes

6ded388
5053a56
f69bea2
 
3bc78d3
fcd98ee
 
 
6fa10d1
f69bea2
6fa10d1
 
 
 
 
f69bea2
 
 
 
 
 
 
 
6ded388
f69bea2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ded388
87b83c0
 
 
 
 
 
 
 
 
 
 
6ded388
f69bea2
 
 
864ff4a
 
f69bea2
 
 
 
 
864ff4a

import gradio as gr
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch
from PIL import Image

# Single checkpoint identifier shared by the processor and the model so the
# two are always loaded from the same repository.
_BLIP2_CHECKPOINT = "Salesforce/blip2-opt-2.7b"

# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Processor that turns an image (and optional text prompt) into model inputs.
processor = AutoProcessor.from_pretrained(_BLIP2_CHECKPOINT)

# Model weights are requested in int8 (bitsandbytes) with automatic device
# placement via device_map='auto'.
model = Blip2ForConditionalGeneration.from_pretrained(
    _BLIP2_CHECKPOINT,
    device_map='auto',
    load_in_8bit=True,
)

def _blip2_generate(image, prompt, max_new_tokens):
    """Run one BLIP-2 generation pass and return the decoded text.

    Args:
        image: RGB PIL image to condition on.
        prompt: Text prompt passed to the processor, or ``None`` for plain
            (unprompted) captioning.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The first decoded sequence with special tokens removed and
        surrounding whitespace stripped.
    """
    # Processor output is moved to ``device`` with dtype float16, matching
    # how every call site prepared its inputs before this helper existed.
    inputs = processor(image, text=prompt, return_tensors="pt").to(device, torch.float16)
    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()


def blip2_interface(image, prompted_caption_text: str, vqa_question: str, chat_context: str):
    """Run the four BLIP-2 demo tasks on a single image.

    Args:
        image: Image as a numpy array (the Gradio image input uses
            ``type="numpy"``).
        prompted_caption_text: Text prefix used for prompted captioning.
        vqa_question: Question inserted into a ``"Question: ... Answer:"``
            prompt.
        chat_context: Free-form context; ``" Answer:"`` is appended to it.

    Returns:
        A 4-tuple of strings:
        ``(image_caption, prompted_caption, vqa_answer, chat_response)``.

    Raises:
        ValueError: If ``image`` is ``None`` (no image was supplied).
    """
    # Fail early with a readable message instead of an opaque error from
    # Image.fromarray when the form is submitted without an image.
    if image is None:
        raise ValueError("An image is required: please provide an image before submitting.")

    # A missing text value would otherwise crash the concatenation below or
    # leak the literal "None" into the VQA prompt.
    if vqa_question is None:
        vqa_question = ""
    if chat_context is None:
        chat_context = ""

    # Prepare image input
    rgb_image = Image.fromarray(image).convert('RGB')

    # Image Captioning (no text prompt)
    image_caption = _blip2_generate(rgb_image, None, 20)

    # Prompted Image Captioning
    prompted_caption = _blip2_generate(rgb_image, prompted_caption_text, 20)

    # Visual Question Answering (VQA)
    vqa_answer = _blip2_generate(rgb_image, f"Question: {vqa_question} Answer:", 10)

    # Chat-based Prompting
    chat_response = _blip2_generate(rgb_image, chat_context + " Answer:", 10)

    return image_caption, prompted_caption, vqa_answer, chat_response

# Define Gradio input components.
# The top-level component classes (gr.Image / gr.Textbox) are used here; the
# gr.inputs / gr.outputs namespaces are deprecated in Gradio 3.x and absent
# from later releases.
image_input = gr.Image(type="numpy", label="Image Input")
prompted_caption_input = gr.Textbox(label="Prompted Caption Text")
vqa_question_input = gr.Textbox(label="VQA Question")
chat_context = gr.Textbox(label="Chat Context")

# Define Gradio output components; their order matches the tuple returned by
# blip2_interface.
image_caption_result = gr.Textbox(label="Image Caption")
prompted_caption_result = gr.Textbox(label="Prompted Image Caption")
vqa_answer = gr.Textbox(label="VQA Answer")
chat_response = gr.Textbox(label="Chat Response")

# Create Gradio interface wiring the four inputs to the four outputs.
iface = gr.Interface(
    fn=blip2_interface,
    inputs=[image_input, prompted_caption_input, vqa_question_input, chat_context],
    outputs=[image_caption_result, prompted_caption_result, vqa_answer, chat_response],
    title="BLIP-2 Image Captioning and VQA",
    description="Interact with the BLIP-2 model for image captioning, prompted image captioning, visual question answering, and chat-based prompting.",
)

# Only start the web server when the file is run as a script.
if __name__ == "__main__":
    iface.launch()